LCOV - differential code coverage report
Current view: top level - src/backend/utils/mb - mbutils.c (source / functions) Coverage Total Hit UNC LBC UIC UBC GBC GIC GNC CBC EUB ECB
Current: Differential Code Coverage HEAD vs 15 Lines: 63.8 % 539 344 8 32 118 37 25 213 11 95 133 217
Current Date: 2023-04-08 15:15:32 Functions: 81.8 % 55 45 10 44 1 10 45
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * mbutils.c
       4                 :  *    This file contains functions for encoding conversion.
       5                 :  *
       6                 :  * The string-conversion functions in this file share some API quirks.
       7                 :  * Note the following:
       8                 :  *
       9                 :  * The functions return a palloc'd, null-terminated string if conversion
      10                 :  * is required.  However, if no conversion is performed, the given source
      11                 :  * string pointer is returned as-is.
      12                 :  *
      13                 :  * Although the presence of a length argument means that callers can pass
      14                 :  * non-null-terminated strings, care is required because the same string
      15                 :  * will be passed back if no conversion occurs.  Such callers *must* check
      16                 :  * whether result == src and handle that case differently.
      17                 :  *
      18                 :  * If the source and destination encodings are the same, the source string
      19                 :  * is returned without any verification; it's assumed to be valid data.
      20                 :  * If that might not be the case, the caller is responsible for validating
      21                 :  * the string using a separate call to pg_verify_mbstr().  Whenever the
      22                 :  * source and destination encodings are different, the functions ensure that
      23                 :  * the result is validly encoded according to the destination encoding.
      24                 :  *
      25                 :  *
      26                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
      27                 :  * Portions Copyright (c) 1994, Regents of the University of California
      28                 :  *
      29                 :  *
      30                 :  * IDENTIFICATION
      31                 :  *    src/backend/utils/mb/mbutils.c
      32                 :  *
      33                 :  *-------------------------------------------------------------------------
      34                 :  */
      35                 : #include "postgres.h"
      36                 : 
      37                 : #include "access/xact.h"
      38                 : #include "catalog/namespace.h"
      39                 : #include "mb/pg_wchar.h"
      40                 : #include "utils/builtins.h"
      41                 : #include "utils/memutils.h"
      42                 : #include "utils/syscache.h"
      43                 : #include "varatt.h"
      44                 : 
      45                 : /*
      46                 :  * We maintain a simple linked list caching the fmgr lookup info for the
      47                 :  * currently selected conversion functions, as well as any that have been
      48                 :  * selected previously in the current session.  (We remember previous
      49                 :  * settings because we must be able to restore a previous setting during
      50                 :  * transaction rollback, without doing any fresh catalog accesses.)
      51                 :  *
      52                 :  * Since we'll never release this data, we just keep it in TopMemoryContext.
      53                 :  */
      54                 : typedef struct ConvProcInfo
      55                 : {
      56                 :     int         s_encoding;     /* server and client encoding IDs */
      57                 :     int         c_encoding;
      58                 :     FmgrInfo    to_server_info; /* lookup info for conversion procs */
      59                 :     FmgrInfo    to_client_info;
      60                 : } ConvProcInfo;
      61                 : 
      62                 : static List *ConvProcList = NIL;    /* List of ConvProcInfo */
      63                 : 
      64                 : /*
      65                 :  * These variables point to the currently active conversion functions,
      66                 :  * or are NULL when no conversion is needed.
      67                 :  */
      68                 : static FmgrInfo *ToServerConvProc = NULL;
      69                 : static FmgrInfo *ToClientConvProc = NULL;
      70                 : 
      71                 : /*
      72                 :  * This variable stores the conversion function to convert from UTF-8
      73                 :  * to the server encoding.  It's NULL if the server encoding *is* UTF-8,
      74                 :  * or if we lack a conversion function for this.
      75                 :  */
      76                 : static FmgrInfo *Utf8ToServerConvProc = NULL;
      77                 : 
      78                 : /*
      79                 :  * These variables track the currently-selected encodings.
      80                 :  */
      81                 : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      82                 : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      83                 : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      84                 : 
      85                 : /*
      86                 :  * During backend startup we can't set client encoding because we (a)
      87                 :  * can't look up the conversion functions, and (b) may not know the database
      88                 :  * encoding yet either.  So SetClientEncoding() just accepts anything and
      89                 :  * remembers it for InitializeClientEncoding() to apply later.
      90                 :  */
      91                 : static bool backend_startup_complete = false;
      92                 : static int  pending_client_encoding = PG_SQL_ASCII;
      93                 : 
      94                 : 
      95                 : /* Internal functions */
      96                 : static char *perform_default_encoding_conversion(const char *src,
      97                 :                                                  int len, bool is_client_to_server);
      98                 : static int  cliplen(const char *str, int len, int limit);
      99                 : 
     100                 : 
     101                 : /*
     102                 :  * Prepare for a future call to SetClientEncoding.  Success should mean
     103                 :  * that SetClientEncoding is guaranteed to succeed for this encoding request.
     104                 :  *
     105                 :  * (But note that success before backend_startup_complete does not guarantee
     106                 :  * success after ...)
     107                 :  *
     108                 :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     109                 :  */
     110                 : int
     111 GIC       26562 : PrepareClientEncoding(int encoding)
     112 ECB             : {
     113                 :     int         current_server_encoding;
     114                 :     ListCell   *lc;
     115                 : 
     116 GIC       26562 :     if (!PG_VALID_FE_ENCODING(encoding))
     117 LBC           0 :         return -1;
     118 EUB             : 
     119                 :     /* Can't do anything during startup, per notes above */
     120 GIC       26562 :     if (!backend_startup_complete)
     121 CBC       12948 :         return 0;
     122 ECB             : 
     123 GIC       13614 :     current_server_encoding = GetDatabaseEncoding();
     124 ECB             : 
     125                 :     /*
     126                 :      * Check for cases that require no conversion function.
     127                 :      */
     128 GIC       13614 :     if (current_server_encoding == encoding ||
     129 CBC        2601 :         current_server_encoding == PG_SQL_ASCII ||
     130 ECB             :         encoding == PG_SQL_ASCII)
     131 GIC       13610 :         return 0;
     132 ECB             : 
     133 GIC           4 :     if (IsTransactionState())
     134 ECB             :     {
     135                 :         /*
     136                 :          * If we're in a live transaction, it's safe to access the catalogs,
     137                 :          * so look up the functions.  We repeat the lookup even if the info is
     138                 :          * already cached, so that we can react to changes in the contents of
     139                 :          * pg_conversion.
     140                 :          */
     141                 :         Oid         to_server_proc,
     142                 :                     to_client_proc;
     143                 :         ConvProcInfo *convinfo;
     144                 :         MemoryContext oldcontext;
     145                 : 
     146 GIC           4 :         to_server_proc = FindDefaultConversionProc(encoding,
     147 ECB             :                                                    current_server_encoding);
     148 GIC           4 :         if (!OidIsValid(to_server_proc))
     149 LBC           0 :             return -1;
     150 GBC           4 :         to_client_proc = FindDefaultConversionProc(current_server_encoding,
     151 ECB             :                                                    encoding);
     152 GIC           4 :         if (!OidIsValid(to_client_proc))
     153 LBC           0 :             return -1;
     154 EUB             : 
     155                 :         /*
     156                 :          * Load the fmgr info into TopMemoryContext (could still fail here)
     157                 :          */
     158 GIC           4 :         convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
     159 ECB             :                                                        sizeof(ConvProcInfo));
     160 GIC           4 :         convinfo->s_encoding = current_server_encoding;
     161 CBC           4 :         convinfo->c_encoding = encoding;
     162               4 :         fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
     163 ECB             :                       TopMemoryContext);
     164 GIC           4 :         fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
     165 ECB             :                       TopMemoryContext);
     166                 : 
     167                 :         /* Attach new info to head of list */
     168 GIC           4 :         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
     169 CBC           4 :         ConvProcList = lcons(convinfo, ConvProcList);
     170               4 :         MemoryContextSwitchTo(oldcontext);
     171 ECB             : 
     172                 :         /*
     173                 :          * We cannot yet remove any older entry for the same encoding pair,
     174                 :          * since it could still be in use.  SetClientEncoding will clean up.
     175                 :          */
     176                 : 
     177 GIC           4 :         return 0;               /* success */
     178 ECB             :     }
     179                 :     else
     180                 :     {
     181                 :         /*
     182                 :          * If we're not in a live transaction, the only thing we can do is
     183                 :          * restore a previous setting using the cache.  This covers all
     184                 :          * transaction-rollback cases.  The only case it might not work for is
     185                 :          * trying to change client_encoding on the fly by editing
     186                 :          * postgresql.conf and SIGHUP'ing, which would probably be a stupid
     187                 :          * thing to do anyway.
     188                 :          */
     189 UIC           0 :         foreach(lc, ConvProcList)
     190 EUB             :         {
     191 UIC           0 :             ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
     192 EUB             : 
     193 UIC           0 :             if (oldinfo->s_encoding == current_server_encoding &&
     194 UBC           0 :                 oldinfo->c_encoding == encoding)
     195               0 :                 return 0;
     196 EUB             :         }
     197                 : 
     198 UIC           0 :         return -1;              /* it's not cached, so fail */
     199 EUB             :     }
     200                 : }
     201                 : 
     202                 : /*
     203                 :  * Set the active client encoding and set up the conversion-function pointers.
     204                 :  * PrepareClientEncoding should have been called previously for this encoding.
     205                 :  *
     206                 :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     207                 :  */
     208                 : int
     209 GIC       23873 : SetClientEncoding(int encoding)
     210 ECB             : {
     211                 :     int         current_server_encoding;
     212                 :     bool        found;
     213                 :     ListCell   *lc;
     214                 : 
     215 GIC       23873 :     if (!PG_VALID_FE_ENCODING(encoding))
     216 LBC           0 :         return -1;
     217 EUB             : 
     218                 :     /* Can't do anything during startup, per notes above */
     219 GIC       23873 :     if (!backend_startup_complete)
     220 ECB             :     {
     221 GIC       11559 :         pending_client_encoding = encoding;
     222 CBC       11559 :         return 0;
     223 ECB             :     }
     224                 : 
     225 GIC       12314 :     current_server_encoding = GetDatabaseEncoding();
     226 ECB             : 
     227                 :     /*
     228                 :      * Check for cases that require no conversion function.
     229                 :      */
     230 GIC       12314 :     if (current_server_encoding == encoding ||
     231 CBC        1303 :         current_server_encoding == PG_SQL_ASCII ||
     232 ECB             :         encoding == PG_SQL_ASCII)
     233                 :     {
     234 GIC       12310 :         ClientEncoding = &pg_enc2name_tbl[encoding];
     235 CBC       12310 :         ToServerConvProc = NULL;
     236           12310 :         ToClientConvProc = NULL;
     237           12310 :         return 0;
     238 ECB             :     }
     239                 : 
     240                 :     /*
     241                 :      * Search the cache for the entry previously prepared by
     242                 :      * PrepareClientEncoding; if there isn't one, we lose.  While at it,
     243                 :      * release any duplicate entries so that repeated Prepare/Set cycles don't
     244                 :      * leak memory.
     245                 :      */
     246 GIC           4 :     found = false;
     247 CBC           8 :     foreach(lc, ConvProcList)
     248 ECB             :     {
     249 GIC           4 :         ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
     250 ECB             : 
     251 GIC           4 :         if (convinfo->s_encoding == current_server_encoding &&
     252 CBC           4 :             convinfo->c_encoding == encoding)
     253 ECB             :         {
     254 GIC           4 :             if (!found)
     255 ECB             :             {
     256                 :                 /* Found newest entry, so set up */
     257 GIC           4 :                 ClientEncoding = &pg_enc2name_tbl[encoding];
     258 CBC           4 :                 ToServerConvProc = &convinfo->to_server_info;
     259               4 :                 ToClientConvProc = &convinfo->to_client_info;
     260               4 :                 found = true;
     261 ECB             :             }
     262                 :             else
     263                 :             {
     264                 :                 /* Duplicate entry, release it */
     265 UIC           0 :                 ConvProcList = foreach_delete_current(ConvProcList, lc);
     266 UBC           0 :                 pfree(convinfo);
     267 EUB             :             }
     268                 :         }
     269                 :     }
     270                 : 
     271 GIC           4 :     if (found)
     272 CBC           4 :         return 0;               /* success */
     273 ECB             :     else
     274 UIC           0 :         return -1;              /* it's not cached, so fail */
     275 EUB             : }
     276                 : 
     277                 : /*
     278                 :  * Initialize client encoding conversions.
     279                 :  *      Called from InitPostgres() once during backend startup.
     280                 :  */
     281                 : void
     282 GIC       10866 : InitializeClientEncoding(void)
     283 ECB             : {
     284                 :     int         current_server_encoding;
     285                 : 
     286 GIC       10866 :     Assert(!backend_startup_complete);
     287 CBC       10866 :     backend_startup_complete = true;
     288 ECB             : 
     289 GIC       21732 :     if (PrepareClientEncoding(pending_client_encoding) < 0 ||
     290 CBC       10866 :         SetClientEncoding(pending_client_encoding) < 0)
     291 ECB             :     {
     292                 :         /*
     293                 :          * Oops, the requested conversion is not available. We couldn't fail
     294                 :          * before, but we can now.
     295                 :          */
     296 UIC           0 :         ereport(FATAL,
     297 EUB             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     298                 :                  errmsg("conversion between %s and %s is not supported",
     299                 :                         pg_enc2name_tbl[pending_client_encoding].name,
     300                 :                         GetDatabaseEncodingName())));
     301                 :     }
     302                 : 
     303                 :     /*
     304                 :      * Also look up the UTF8-to-server conversion function if needed.  Since
     305                 :      * the server encoding is fixed within any one backend process, we don't
     306                 :      * have to do this more than once.
     307                 :      */
     308 GIC       10866 :     current_server_encoding = GetDatabaseEncoding();
     309 CBC       10866 :     if (current_server_encoding != PG_UTF8 &&
     310 ECB             :         current_server_encoding != PG_SQL_ASCII)
     311                 :     {
     312                 :         Oid         utf8_to_server_proc;
     313                 : 
     314 GIC         101 :         Assert(IsTransactionState());
     315 ECB             :         utf8_to_server_proc =
     316 GIC         101 :             FindDefaultConversionProc(PG_UTF8,
     317 ECB             :                                       current_server_encoding);
     318                 :         /* If there's no such conversion, just leave the pointer as NULL */
     319 GIC         101 :         if (OidIsValid(utf8_to_server_proc))
     320 ECB             :         {
     321                 :             FmgrInfo   *finfo;
     322                 : 
     323 GIC         101 :             finfo = (FmgrInfo *) MemoryContextAlloc(TopMemoryContext,
     324 ECB             :                                                     sizeof(FmgrInfo));
     325 GIC         101 :             fmgr_info_cxt(utf8_to_server_proc, finfo,
     326 ECB             :                           TopMemoryContext);
     327                 :             /* Set Utf8ToServerConvProc only after data is fully valid */
     328 GIC         101 :             Utf8ToServerConvProc = finfo;
     329 ECB             :         }
     330                 :     }
     331 GIC       10866 : }
     332 ECB             : 
     333                 : /*
     334                 :  * returns the current client encoding
     335                 :  */
     336                 : int
     337 GIC        4449 : pg_get_client_encoding(void)
     338 ECB             : {
     339 GIC        4449 :     return ClientEncoding->encoding;
     340 ECB             : }
     341                 : 
     342                 : /*
     343                 :  * returns the current client encoding name
     344                 :  */
     345                 : const char *
     346 UIC           0 : pg_get_client_encoding_name(void)
     347 EUB             : {
     348 UIC           0 :     return ClientEncoding->name;
     349 EUB             : }
     350                 : 
     351                 : /*
     352                 :  * Convert src string to another encoding (general case).
     353                 :  *
     354                 :  * See the notes about string conversion functions at the top of this file.
     355                 :  */
     356                 : unsigned char *
     357 GIC        1417 : pg_do_encoding_conversion(unsigned char *src, int len,
     358 ECB             :                           int src_encoding, int dest_encoding)
     359                 : {
     360                 :     unsigned char *result;
     361                 :     Oid         proc;
     362                 : 
     363 GIC        1417 :     if (len <= 0)
     364 CBC          15 :         return src;             /* empty string is always valid */
     365 ECB             : 
     366 GIC        1402 :     if (src_encoding == dest_encoding)
     367 CBC         896 :         return src;             /* no conversion required, assume valid */
     368 ECB             : 
     369 GIC         506 :     if (dest_encoding == PG_SQL_ASCII)
     370 LBC           0 :         return src;             /* any string is valid in SQL_ASCII */
     371 EUB             : 
     372 GIC         506 :     if (src_encoding == PG_SQL_ASCII)
     373 ECB             :     {
     374                 :         /* No conversion is possible, but we must validate the result */
     375 GIC           8 :         (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
     376 CBC           8 :         return src;
     377 ECB             :     }
     378                 : 
     379 GIC         498 :     if (!IsTransactionState())  /* shouldn't happen */
     380 LBC           0 :         elog(ERROR, "cannot perform encoding conversion outside a transaction");
     381 EUB             : 
     382 GIC         498 :     proc = FindDefaultConversionProc(src_encoding, dest_encoding);
     383 CBC         498 :     if (!OidIsValid(proc))
     384 LBC           0 :         ereport(ERROR,
     385 EUB             :                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
     386                 :                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
     387                 :                         pg_encoding_to_char(src_encoding),
     388                 :                         pg_encoding_to_char(dest_encoding))));
     389                 : 
     390                 :     /*
     391                 :      * Allocate space for conversion result, being wary of integer overflow.
     392                 :      *
     393                 :      * len * MAX_CONVERSION_GROWTH is typically a vast overestimate of the
     394                 :      * required space, so it might exceed MaxAllocSize even though the result
     395                 :      * would actually fit.  We do not want to hand back a result string that
     396                 :      * exceeds MaxAllocSize, because callers might not cope gracefully --- but
     397                 :      * if we just allocate more than that, and don't use it, that's fine.
     398                 :      */
     399 GIC         498 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     400 LBC           0 :         ereport(ERROR,
     401 EUB             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     402                 :                  errmsg("out of memory"),
     403                 :                  errdetail("String of %d bytes is too long for encoding conversion.",
     404                 :                            len)));
     405                 : 
     406                 :     result = (unsigned char *)
     407 GIC         498 :         MemoryContextAllocHuge(CurrentMemoryContext,
     408 CBC         498 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);
     409 ECB             : 
     410 GIC         498 :     (void) OidFunctionCall6(proc,
     411 ECB             :                             Int32GetDatum(src_encoding),
     412                 :                             Int32GetDatum(dest_encoding),
     413                 :                             CStringGetDatum((char *) src),
     414                 :                             CStringGetDatum((char *) result),
     415                 :                             Int32GetDatum(len),
     416                 :                             BoolGetDatum(false));
     417                 : 
     418                 :     /*
     419                 :      * If the result is large, it's worth repalloc'ing to release any extra
     420                 :      * space we asked for.  The cutoff here is somewhat arbitrary, but we
     421                 :      * *must* check when len * MAX_CONVERSION_GROWTH exceeds MaxAllocSize.
     422                 :      */
     423 GIC         498 :     if (len > 1000000)
     424 ECB             :     {
     425 UIC           0 :         Size        resultlen = strlen((char *) result);
     426 EUB             : 
     427 UIC           0 :         if (resultlen >= MaxAllocSize)
     428 UBC           0 :             ereport(ERROR,
     429 EUB             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     430                 :                      errmsg("out of memory"),
     431                 :                      errdetail("String of %d bytes is too long for encoding conversion.",
     432                 :                                len)));
     433                 : 
     434 UIC           0 :         result = (unsigned char *) repalloc(result, resultlen + 1);
     435 EUB             :     }
     436                 : 
     437 GIC         498 :     return result;
     438 ECB             : }
     439                 : 
     440                 : /*
     441                 :  * Convert src string to another encoding.
     442                 :  *
     443                 :  * This function has a different API than the other conversion functions.
     444                 :  * The caller should've looked up the conversion function using
     445                 :  * FindDefaultConversionProc().  Unlike the other functions, the converted
     446                 :  * result is not palloc'd.  It is written to the caller-supplied buffer
     447                 :  * instead.
     448                 :  *
     449                 :  * src_encoding   - encoding to convert from
     450                 :  * dest_encoding  - encoding to convert to
     451                 :  * src, srclen    - input buffer and its length in bytes
     452                 :  * dest, destlen  - destination buffer and its size in bytes
     453                 :  *
     454                 :  * The output is null-terminated.
     455                 :  *
     456                 :  * If destlen < srclen * MAX_CONVERSION_GROWTH + 1, the converted output
     457                 :  * wouldn't necessarily fit in the output buffer, and the function will not
     458                 :  * convert the whole input.
     459                 :  *
     460                 :  * TODO: The conversion function interface is not great.  Firstly, it
     461                 :  * would be nice to pass through the destination buffer size to the
     462                 :  * conversion function, so that if you pass a shorter destination buffer, it
     463                 :  * could still continue to fill up the whole buffer.  Currently, we have to
     464                 :  * assume worst case expansion and stop the conversion short, even if there
     465                 :  * is in fact space left in the destination buffer.  Secondly, it would be
     466                 :  * nice to return the number of bytes written to the caller, to avoid a call
     467                 :  * to strlen().
     468                 :  */
     469                 : int
     470 GIC        2853 : pg_do_encoding_conversion_buf(Oid proc,
     471 ECB             :                               int src_encoding,
     472                 :                               int dest_encoding,
     473                 :                               unsigned char *src, int srclen,
     474                 :                               unsigned char *dest, int destlen,
     475                 :                               bool noError)
     476                 : {
     477                 :     Datum       result;
     478                 : 
     479                 :     /*
     480                 :      * If the destination buffer is not large enough to hold the result in the
     481                 :      * worst case, limit the input size passed to the conversion function.
     482                 :      */
     483 GIC        2853 :     if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH))
     484 CBC        2853 :         srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH);
     485 ECB             : 
     486 GIC        2853 :     result = OidFunctionCall6(proc,
     487 ECB             :                               Int32GetDatum(src_encoding),
     488                 :                               Int32GetDatum(dest_encoding),
     489                 :                               CStringGetDatum((char *) src),
     490                 :                               CStringGetDatum((char *) dest),
     491                 :                               Int32GetDatum(srclen),
     492                 :                               BoolGetDatum(noError));
     493 GIC        1683 :     return DatumGetInt32(result);
     494 ECB             : }
     495                 : 
     496                 : /*
     497                 :  * Convert string to encoding encoding_name. The source
     498                 :  * encoding is the DB encoding.
     499                 :  *
     500                 :  * BYTEA convert_to(TEXT string, NAME encoding_name) */
     501                 : Datum
     502 GIC          24 : pg_convert_to(PG_FUNCTION_ARGS)
     503 ECB             : {
     504 GIC          24 :     Datum       string = PG_GETARG_DATUM(0);
     505 CBC          24 :     Datum       dest_encoding_name = PG_GETARG_DATUM(1);
     506              24 :     Datum       src_encoding_name = DirectFunctionCall1(namein,
     507 ECB             :                                                         CStringGetDatum(DatabaseEncoding->name));
     508                 :     Datum       result;
     509                 : 
     510                 :     /*
     511                 :      * pg_convert expects a bytea as its first argument. We're passing it a
     512                 :      * text argument here, relying on the fact that they are both in fact
     513                 :      * varlena types, and thus structurally identical.
     514                 :      */
     515 GIC          24 :     result = DirectFunctionCall3(pg_convert, string,
     516 ECB             :                                  src_encoding_name, dest_encoding_name);
     517                 : 
     518 GIC          24 :     PG_RETURN_DATUM(result);
     519 ECB             : }
     520                 : 
     521                 : /*
     522                 :  * Convert string from encoding encoding_name. The destination
     523                 :  * encoding is the DB encoding.
     524                 :  *
     525                 :  * TEXT convert_from(BYTEA string, NAME encoding_name) */
     526                 : Datum
     527 GIC         281 : pg_convert_from(PG_FUNCTION_ARGS)
     528 ECB             : {
     529 GIC         281 :     Datum       string = PG_GETARG_DATUM(0);
     530 CBC         281 :     Datum       src_encoding_name = PG_GETARG_DATUM(1);
     531             281 :     Datum       dest_encoding_name = DirectFunctionCall1(namein,
     532 ECB             :                                                          CStringGetDatum(DatabaseEncoding->name));
     533                 :     Datum       result;
     534                 : 
     535 GIC         281 :     result = DirectFunctionCall3(pg_convert, string,
     536 ECB             :                                  src_encoding_name, dest_encoding_name);
     537                 : 
     538                 :     /*
     539                 :      * pg_convert returns a bytea, which we in turn return as text, relying on
     540                 :      * the fact that they are both in fact varlena types, and thus
     541                 :      * structurally identical. Although not all bytea values are valid text,
     542                 :      * in this case it will be because we've told pg_convert to return one
     543                 :      * that is valid as text in the current database encoding.
     544                 :      */
     545 GIC         278 :     PG_RETURN_DATUM(result);
     546 ECB             : }
     547                 : 
     548                 : /*
     549                 :  * Convert string between two arbitrary encodings.
     550                 :  *
     551                 :  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
     552                 :  */
     553                 : Datum
     554 GIC         689 : pg_convert(PG_FUNCTION_ARGS)
     555 ECB             : {
     556 GIC         689 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     557 CBC         689 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     558             689 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     559             689 :     char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
     560             689 :     int         dest_encoding = pg_char_to_encoding(dest_encoding_name);
     561 ECB             :     const char *src_str;
     562                 :     char       *dest_str;
     563                 :     bytea      *retval;
     564                 :     int         len;
     565                 : 
     566 GIC         689 :     if (src_encoding < 0)
     567 LBC           0 :         ereport(ERROR,
     568 EUB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     569                 :                  errmsg("invalid source encoding name \"%s\"",
     570                 :                         src_encoding_name)));
     571 GIC         689 :     if (dest_encoding < 0)
     572 LBC           0 :         ereport(ERROR,
     573 EUB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     574                 :                  errmsg("invalid destination encoding name \"%s\"",
     575                 :                         dest_encoding_name)));
     576                 : 
     577                 :     /* make sure that source string is valid */
     578 GIC         689 :     len = VARSIZE_ANY_EXHDR(string);
     579 CBC         689 :     src_str = VARDATA_ANY(string);
     580             689 :     (void) pg_verify_mbstr(src_encoding, src_str, len, false);
     581 ECB             : 
     582                 :     /* perform conversion */
     583 GIC         686 :     dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
     584 ECB             :                                                   len,
     585                 :                                                   src_encoding,
     586                 :                                                   dest_encoding);
     587                 : 
     588                 :     /* update len if conversion actually happened */
     589 GIC         686 :     if (dest_str != src_str)
     590 CBC         384 :         len = strlen(dest_str);
     591 ECB             : 
     592                 :     /*
     593                 :      * build bytea data type structure.
     594                 :      */
     595 GIC         686 :     retval = (bytea *) palloc(len + VARHDRSZ);
     596 CBC         686 :     SET_VARSIZE(retval, len + VARHDRSZ);
     597             686 :     memcpy(VARDATA(retval), dest_str, len);
     598 ECB             : 
     599 GIC         686 :     if (dest_str != src_str)
     600 CBC         384 :         pfree(dest_str);
     601 ECB             : 
     602                 :     /* free memory if allocated by the toaster */
     603 GIC         686 :     PG_FREE_IF_COPY(string, 0);
     604 ECB             : 
     605 GIC         686 :     PG_RETURN_BYTEA_P(retval);
     606 ECB             : }
     607                 : 
     608                 : /*
     609                 :  * Get the length of the string considered as text in the specified
     610                 :  * encoding. Raises an error if the data is not valid in that
     611                 :  * encoding.
     612                 :  *
     613                 :  * INT4 length (BYTEA string, NAME src_encoding_name)
     614                 :  */
     615                 : Datum
     616 UIC           0 : length_in_encoding(PG_FUNCTION_ARGS)
     617 EUB             : {
     618 UIC           0 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     619 UBC           0 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     620               0 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     621 EUB             :     const char *src_str;
     622                 :     int         len;
     623                 :     int         retval;
     624                 : 
     625 UIC           0 :     if (src_encoding < 0)
     626 UBC           0 :         ereport(ERROR,
     627 EUB             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     628                 :                  errmsg("invalid encoding name \"%s\"",
     629                 :                         src_encoding_name)));
     630                 : 
     631 UIC           0 :     len = VARSIZE_ANY_EXHDR(string);
     632 UBC           0 :     src_str = VARDATA_ANY(string);
     633 EUB             : 
     634 UIC           0 :     retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
     635 EUB             : 
     636 UIC           0 :     PG_RETURN_INT32(retval);
     637 EUB             : }
     638                 : 
     639                 : /*
     640                 :  * Get maximum multibyte character length in the specified encoding.
     641                 :  *
     642                 :  * Note encoding is specified numerically, not by name as above.
     643                 :  */
     644                 : Datum
     645 UIC           0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
     646 EUB             : {
     647 UIC           0 :     int         encoding = PG_GETARG_INT32(0);
     648 EUB             : 
     649 UIC           0 :     if (PG_VALID_ENCODING(encoding))
     650 UBC           0 :         PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
     651 EUB             :     else
     652 UIC           0 :         PG_RETURN_NULL();
     653 EUB             : }
     654                 : 
     655                 : /*
     656                 :  * Convert client encoding to server encoding.
     657                 :  *
     658                 :  * See the notes about string conversion functions at the top of this file.
     659                 :  */
     660                 : char *
     661 GIC      629604 : pg_client_to_server(const char *s, int len)
     662 ECB             : {
     663 GIC      629604 :     return pg_any_to_server(s, len, ClientEncoding->encoding);
     664 ECB             : }
     665                 : 
     666                 : /*
     667                 :  * Convert any encoding to server encoding.
     668                 :  *
     669                 :  * See the notes about string conversion functions at the top of this file.
     670                 :  *
     671                 :  * Unlike the other string conversion functions, this will apply validation
     672                 :  * even if encoding == DatabaseEncoding->encoding.  This is because this is
     673                 :  * used to process data coming in from outside the database, and we never
     674                 :  * want to just assume validity.
     675                 :  */
     676                 : char *
     677 GIC      668076 : pg_any_to_server(const char *s, int len, int encoding)
     678 ECB             : {
     679 GIC      668076 :     if (len <= 0)
     680 CBC       36008 :         return unconstify(char *, s);   /* empty string is always valid */
     681 ECB             : 
     682 GIC      632068 :     if (encoding == DatabaseEncoding->encoding ||
     683 ECB             :         encoding == PG_SQL_ASCII)
     684                 :     {
     685                 :         /*
     686                 :          * No conversion is needed, but we must still validate the data.
     687                 :          */
     688 GIC      631910 :         (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
     689 CBC      631909 :         return unconstify(char *, s);
     690 ECB             :     }
     691                 : 
     692 GIC         158 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     693 ECB             :     {
     694                 :         /*
     695                 :          * No conversion is possible, but we must still validate the data,
     696                 :          * because the client-side code might have done string escaping using
     697                 :          * the selected client_encoding.  If the client encoding is ASCII-safe
     698                 :          * then we just do a straight validation under that encoding.  For an
     699                 :          * ASCII-unsafe encoding we have a problem: we dare not pass such data
     700                 :          * to the parser but we have no way to convert it.  We compromise by
     701                 :          * rejecting the data if it contains any non-ASCII characters.
     702                 :          */
     703 GIC          26 :         if (PG_VALID_BE_ENCODING(encoding))
     704 CBC          26 :             (void) pg_verify_mbstr(encoding, s, len, false);
     705 ECB             :         else
     706                 :         {
     707                 :             int         i;
     708                 : 
     709 UIC           0 :             for (i = 0; i < len; i++)
     710 EUB             :             {
     711 UIC           0 :                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
     712 UBC           0 :                     ereport(ERROR,
     713 EUB             :                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     714                 :                              errmsg("invalid byte value for encoding \"%s\": 0x%02x",
     715                 :                                     pg_enc2name_tbl[PG_SQL_ASCII].name,
     716                 :                                     (unsigned char) s[i])));
     717                 :             }
     718                 :         }
     719 GIC          26 :         return unconstify(char *, s);
     720 ECB             :     }
     721                 : 
     722                 :     /* Fast path if we can use cached conversion function */
     723 GIC         132 :     if (encoding == ClientEncoding->encoding)
     724 CBC          18 :         return perform_default_encoding_conversion(s, len, true);
     725 ECB             : 
     726                 :     /* General case ... will not work outside transactions */
     727 GIC         114 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     728 ECB             :                                               len,
     729                 :                                               encoding,
     730 GIC         114 :                                               DatabaseEncoding->encoding);
     731 ECB             : }
     732                 : 
     733                 : /*
     734                 :  * Convert server encoding to client encoding.
     735                 :  *
     736                 :  * See the notes about string conversion functions at the top of this file.
     737                 :  */
     738                 : char *
     739 GIC    14303212 : pg_server_to_client(const char *s, int len)
     740 ECB             : {
     741 GIC    14303212 :     return pg_server_to_any(s, len, ClientEncoding->encoding);
     742 ECB             : }
     743                 : 
     744                 : /*
     745                 :  * Convert server encoding to any encoding.
     746                 :  *
     747                 :  * See the notes about string conversion functions at the top of this file.
     748                 :  */
     749                 : char *
     750 GIC    19099607 : pg_server_to_any(const char *s, int len, int encoding)
     751 ECB             : {
     752 GIC    19099607 :     if (len <= 0)
     753 CBC       93680 :         return unconstify(char *, s);   /* empty string is always valid */
     754 ECB             : 
     755 GIC    19005927 :     if (encoding == DatabaseEncoding->encoding ||
     756 ECB             :         encoding == PG_SQL_ASCII)
     757 GIC    19005777 :         return unconstify(char *, s);   /* assume data is valid */
     758 ECB             : 
     759 GIC         150 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     760 ECB             :     {
     761                 :         /* No conversion is possible, but we must validate the result */
     762 GIC           6 :         (void) pg_verify_mbstr(encoding, s, len, false);
     763 CBC           6 :         return unconstify(char *, s);
     764 ECB             :     }
     765                 : 
     766                 :     /* Fast path if we can use cached conversion function */
     767 GIC         144 :     if (encoding == ClientEncoding->encoding)
     768 CBC         144 :         return perform_default_encoding_conversion(s, len, false);
     769 ECB             : 
     770                 :     /* General case ... will not work outside transactions */
     771 UIC           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, s),
     772 EUB             :                                               len,
     773 UIC           0 :                                               DatabaseEncoding->encoding,
     774 EUB             :                                               encoding);
     775                 : }
     776                 : 
     777                 : /*
     778                 :  *  Perform default encoding conversion using cached FmgrInfo. Since
     779                 :  *  this function does not access database at all, it is safe to call
     780                 :  *  outside transactions.  If the conversion has not been set up by
     781                 :  *  SetClientEncoding(), no conversion is performed.
     782                 :  */
     783                 : static char *
     784 GIC         162 : perform_default_encoding_conversion(const char *src, int len,
     785 ECB             :                                     bool is_client_to_server)
     786                 : {
     787                 :     char       *result;
     788                 :     int         src_encoding,
     789                 :                 dest_encoding;
     790                 :     FmgrInfo   *flinfo;
     791                 : 
     792 GIC         162 :     if (is_client_to_server)
     793 ECB             :     {
     794 GIC          18 :         src_encoding = ClientEncoding->encoding;
     795 CBC          18 :         dest_encoding = DatabaseEncoding->encoding;
     796              18 :         flinfo = ToServerConvProc;
     797 ECB             :     }
     798                 :     else
     799                 :     {
     800 GIC         144 :         src_encoding = DatabaseEncoding->encoding;
     801 CBC         144 :         dest_encoding = ClientEncoding->encoding;
     802             144 :         flinfo = ToClientConvProc;
     803 ECB             :     }
     804                 : 
     805 GIC         162 :     if (flinfo == NULL)
     806 LBC           0 :         return unconstify(char *, src);
     807 EUB             : 
     808                 :     /*
     809                 :      * Allocate space for conversion result, being wary of integer overflow.
     810                 :      * See comments in pg_do_encoding_conversion.
     811                 :      */
     812 GIC         162 :     if ((Size) len >= (MaxAllocHugeSize / (Size) MAX_CONVERSION_GROWTH))
     813 LBC           0 :         ereport(ERROR,
     814 EUB             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     815                 :                  errmsg("out of memory"),
     816                 :                  errdetail("String of %d bytes is too long for encoding conversion.",
     817                 :                            len)));
     818                 : 
     819                 :     result = (char *)
     820 GIC         162 :         MemoryContextAllocHuge(CurrentMemoryContext,
     821 CBC         162 :                                (Size) len * MAX_CONVERSION_GROWTH + 1);
     822 ECB             : 
     823 GIC         162 :     FunctionCall6(flinfo,
     824 ECB             :                   Int32GetDatum(src_encoding),
     825                 :                   Int32GetDatum(dest_encoding),
     826                 :                   CStringGetDatum(src),
     827                 :                   CStringGetDatum(result),
     828                 :                   Int32GetDatum(len),
     829                 :                   BoolGetDatum(false));
     830                 : 
     831                 :     /*
     832                 :      * Release extra space if there might be a lot --- see comments in
     833                 :      * pg_do_encoding_conversion.
     834                 :      */
     835 GIC         162 :     if (len > 1000000)
     836 ECB             :     {
     837 UIC           0 :         Size        resultlen = strlen(result);
     838 EUB             : 
     839 UIC           0 :         if (resultlen >= MaxAllocSize)
     840 UBC           0 :             ereport(ERROR,
     841 EUB             :                     (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     842                 :                      errmsg("out of memory"),
     843                 :                      errdetail("String of %d bytes is too long for encoding conversion.",
     844                 :                                len)));
     845                 : 
     846 UIC           0 :         result = (char *) repalloc(result, resultlen + 1);
     847 EUB             :     }
     848                 : 
     849 GIC         162 :     return result;
     850 ECB             : }
     851                 : 
     852                 : /*
     853                 :  * Convert a single Unicode code point into a string in the server encoding.
     854                 :  *
     855                 :  * The code point given by "c" is converted and stored at *s, which must
     856                 :  * have at least MAX_UNICODE_EQUIVALENT_STRING+1 bytes available.
     857                 :  * The output will have a trailing '\0'.  Throws error if the conversion
     858                 :  * cannot be performed.
     859                 :  *
     860                 :  * Note that this relies on having previously looked up any required
     861                 :  * conversion function.  That's partly for speed but mostly because the parser
     862                 :  * may call this outside any transaction, or in an aborted transaction.
     863                 :  */
     864                 : void
     865 GIC         300 : pg_unicode_to_server(pg_wchar c, unsigned char *s)
     866 ECB             : {
     867                 :     unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
     868                 :     int         c_as_utf8_len;
     869                 :     int         server_encoding;
     870                 : 
     871                 :     /*
     872                 :      * Complain if invalid Unicode code point.  The choice of errcode here is
     873                 :      * debatable, but really our caller should have checked this anyway.
     874                 :      */
     875 GIC         300 :     if (!is_valid_unicode_codepoint(c))
     876 LBC           0 :         ereport(ERROR,
     877 EUB             :                 (errcode(ERRCODE_SYNTAX_ERROR),
     878                 :                  errmsg("invalid Unicode code point")));
     879                 : 
     880                 :     /* Otherwise, if it's in ASCII range, conversion is trivial */
     881 GIC         300 :     if (c <= 0x7F)
     882 ECB             :     {
     883 GIC         117 :         s[0] = (unsigned char) c;
     884 CBC         117 :         s[1] = '\0';
     885             300 :         return;
     886 ECB             :     }
     887                 : 
     888                 :     /* If the server encoding is UTF-8, we just need to reformat the code */
     889 GIC         183 :     server_encoding = GetDatabaseEncoding();
     890 CBC         183 :     if (server_encoding == PG_UTF8)
     891 ECB             :     {
     892 GIC         183 :         unicode_to_utf8(c, s);
     893 CBC         183 :         s[pg_utf_mblen(s)] = '\0';
     894             183 :         return;
     895 ECB             :     }
     896                 : 
     897                 :     /* For all other cases, we must have a conversion function available */
     898 UIC           0 :     if (Utf8ToServerConvProc == NULL)
     899 UBC           0 :         ereport(ERROR,
     900 EUB             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     901                 :                  errmsg("conversion between %s and %s is not supported",
     902                 :                         pg_enc2name_tbl[PG_UTF8].name,
     903                 :                         GetDatabaseEncodingName())));
     904                 : 
     905                 :     /* Construct UTF-8 source string */
     906 UIC           0 :     unicode_to_utf8(c, c_as_utf8);
     907 UBC           0 :     c_as_utf8_len = pg_utf_mblen(c_as_utf8);
     908               0 :     c_as_utf8[c_as_utf8_len] = '\0';
     909 EUB             : 
     910                 :     /* Convert, or throw error if we can't */
     911 UIC           0 :     FunctionCall6(Utf8ToServerConvProc,
     912 EUB             :                   Int32GetDatum(PG_UTF8),
     913                 :                   Int32GetDatum(server_encoding),
     914                 :                   CStringGetDatum((char *) c_as_utf8),
     915                 :                   CStringGetDatum((char *) s),
     916                 :                   Int32GetDatum(c_as_utf8_len),
     917                 :                   BoolGetDatum(false));
     918                 : }
     919                 : 
     920                 : /*
     921                 :  * Convert a single Unicode code point into a string in the server encoding.
     922                 :  *
     923                 :  * Same as pg_unicode_to_server(), except that we don't throw errors,
     924                 :  * but simply return false on conversion failure.
     925                 :  */
     926                 : bool
     927 GNC          42 : pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
     928                 : {
     929                 :     unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
     930                 :     int         c_as_utf8_len;
     931                 :     int         converted_len;
     932                 :     int         server_encoding;
     933                 : 
     934                 :     /* Fail if invalid Unicode code point */
     935              42 :     if (!is_valid_unicode_codepoint(c))
     936 UNC           0 :         return false;
     937                 : 
     938                 :     /* Otherwise, if it's in ASCII range, conversion is trivial */
     939 GNC          42 :     if (c <= 0x7F)
     940                 :     {
     941              12 :         s[0] = (unsigned char) c;
     942              12 :         s[1] = '\0';
     943              12 :         return true;
     944                 :     }
     945                 : 
     946                 :     /* If the server encoding is UTF-8, we just need to reformat the code */
     947              30 :     server_encoding = GetDatabaseEncoding();
     948              30 :     if (server_encoding == PG_UTF8)
     949                 :     {
     950              30 :         unicode_to_utf8(c, s);
     951              30 :         s[pg_utf_mblen(s)] = '\0';
     952              30 :         return true;
     953                 :     }
     954                 : 
     955                 :     /* For all other cases, we must have a conversion function available */
     956 UNC           0 :     if (Utf8ToServerConvProc == NULL)
     957               0 :         return false;
     958                 : 
     959                 :     /* Construct UTF-8 source string */
     960               0 :     unicode_to_utf8(c, c_as_utf8);
     961               0 :     c_as_utf8_len = pg_utf_mblen(c_as_utf8);
     962               0 :     c_as_utf8[c_as_utf8_len] = '\0';
     963                 : 
     964                 :     /* Convert, but without throwing error if we can't */
     965               0 :     converted_len = DatumGetInt32(FunctionCall6(Utf8ToServerConvProc,
     966                 :                                                 Int32GetDatum(PG_UTF8),
     967                 :                                                 Int32GetDatum(server_encoding),
     968                 :                                                 CStringGetDatum((char *) c_as_utf8),
     969                 :                                                 CStringGetDatum((char *) s),
     970                 :                                                 Int32GetDatum(c_as_utf8_len),
     971                 :                                                 BoolGetDatum(true)));
     972                 : 
     973                 :     /* Conversion was successful iff it consumed the whole input */
     974               0 :     return (converted_len == c_as_utf8_len);
     975                 : }
     976                 : 
     977                 : 
     978                 : /* convert a multibyte string to a wchar string */
     979                 : int
     980 UIC           0 : pg_mb2wchar(const char *from, pg_wchar *to)
     981                 : {
     982               0 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, strlen(from));
     983                 : }
     984                 : 
     985 ECB             : /* convert a multibyte string to a wchar string with a limited length */
     986                 : int
     987 GIC      458799 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
     988                 : {
     989          458799 :     return pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     990                 : }
     991                 : 
     992                 : /* same, with any encoding */
     993 ECB             : int
     994 GBC        9140 : pg_encoding_mb2wchar_with_len(int encoding,
     995                 :                               const char *from, pg_wchar *to, int len)
     996                 : {
     997 CBC        9140 :     return pg_wchar_table[encoding].mb2wchar_with_len((const unsigned char *) from, to, len);
     998                 : }
     999 ECB             : 
    1000                 : /* convert a wchar string to a multibyte string */
    1001                 : int
    1002 UIC           0 : pg_wchar2mb(const pg_wchar *from, char *to)
    1003                 : {
    1004               0 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, pg_wchar_strlen(from));
    1005 ECB             : }
    1006                 : 
    1007                 : /* convert a wchar string to a multibyte string with a limited length */
    1008                 : int
    1009 CBC      555741 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
    1010 ECB             : {
    1011 GIC      555741 :     return pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
    1012                 : }
    1013                 : 
    1014 EUB             : /* same, with any encoding */
    1015                 : int
    1016 UIC           0 : pg_encoding_wchar2mb_with_len(int encoding,
    1017                 :                               const pg_wchar *from, char *to, int len)
    1018 EUB             : {
    1019 UBC           0 :     return pg_wchar_table[encoding].wchar2mb_with_len(from, (unsigned char *) to, len);
    1020 EUB             : }
    1021                 : 
    1022                 : /* returns the byte length of a multibyte character */
    1023                 : int
    1024 GIC   111073808 : pg_mblen(const char *mbstr)
    1025                 : {
    1026       111073808 :     return pg_wchar_table[DatabaseEncoding->encoding].mblen((const unsigned char *) mbstr);
    1027                 : }
    1028                 : 
    1029                 : /* returns the display length of a multibyte character */
    1030                 : int
    1031            4362 : pg_dsplen(const char *mbstr)
    1032 EUB             : {
    1033 GIC        4362 :     return pg_wchar_table[DatabaseEncoding->encoding].dsplen((const unsigned char *) mbstr);
    1034                 : }
    1035                 : 
    1036                 : /* returns the length (counted in wchars) of a multibyte string */
    1037                 : int
    1038 GBC         351 : pg_mbstrlen(const char *mbstr)
    1039                 : {
    1040             351 :     int         len = 0;
    1041                 : 
    1042                 :     /* optimization for single byte encoding */
    1043 GIC         351 :     if (pg_database_encoding_max_length() == 1)
    1044 UIC           0 :         return strlen(mbstr);
    1045 ECB             : 
    1046 GIC         813 :     while (*mbstr)
    1047 ECB             :     {
    1048 GIC         462 :         mbstr += pg_mblen(mbstr);
    1049             462 :         len++;
    1050                 :     }
    1051             351 :     return len;
    1052 ECB             : }
    1053                 : 
    1054                 : /* returns the length (counted in wchars) of a multibyte string
    1055                 :  * (not necessarily NULL terminated)
    1056                 :  */
    1057                 : int
    1058 GIC      847091 : pg_mbstrlen_with_len(const char *mbstr, int limit)
    1059                 : {
    1060 GBC      847091 :     int         len = 0;
    1061                 : 
    1062 EUB             :     /* optimization for single byte encoding */
    1063 GIC      847091 :     if (pg_database_encoding_max_length() == 1)
    1064 UIC           0 :         return limit;
    1065                 : 
    1066 GIC    95754570 :     while (limit > 0 && *mbstr)
    1067 ECB             :     {
    1068 GIC    94907479 :         int         l = pg_mblen(mbstr);
    1069 ECB             : 
    1070 GIC    94907479 :         limit -= l;
    1071        94907479 :         mbstr += l;
    1072        94907479 :         len++;
    1073                 :     }
    1074 GBC      847091 :     return len;
    1075                 : }
    1076                 : 
    1077 EUB             : /*
    1078                 :  * returns the byte length of a multibyte string
    1079                 :  * (not necessarily NULL terminated)
    1080                 :  * that is no longer than limit.
    1081                 :  * this function does not break multibyte character boundary.
    1082 ECB             :  */
    1083                 : int
    1084 CBC      214148 : pg_mbcliplen(const char *mbstr, int len, int limit)
    1085                 : {
    1086 GIC      214148 :     return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
    1087                 :                                  len, limit);
    1088                 : }
    1089 ECB             : 
    1090                 : /*
    1091                 :  * pg_mbcliplen with specified encoding
    1092                 :  */
    1093                 : int
    1094 GIC      214148 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
    1095                 :                       int len, int limit)
    1096 ECB             : {
    1097                 :     mblen_converter mblen_fn;
    1098 CBC      214148 :     int         clen = 0;
    1099                 :     int         l;
    1100                 : 
    1101 ECB             :     /* optimization for single byte encoding */
    1102 GBC      214148 :     if (pg_encoding_max_length(encoding) == 1)
    1103 GIC       41840 :         return cliplen(mbstr, len, limit);
    1104 ECB             : 
    1105 GIC      172308 :     mblen_fn = pg_wchar_table[encoding].mblen;
    1106 ECB             : 
    1107 CBC     1693040 :     while (len > 0 && *mbstr)
    1108                 :     {
    1109         1604605 :         l = (*mblen_fn) ((const unsigned char *) mbstr);
    1110 GIC     1604605 :         if ((clen + l) > limit)
    1111              41 :             break;
    1112         1604564 :         clen += l;
    1113         1604564 :         if (clen == limit)
    1114           83832 :             break;
    1115         1520732 :         len -= l;
    1116 CBC     1520732 :         mbstr += l;
    1117                 :     }
    1118          172308 :     return clen;
    1119                 : }
    1120                 : 
    1121 ECB             : /*
    1122 EUB             :  * Similar to pg_mbcliplen except the limit parameter specifies the
    1123                 :  * character length, not the byte length.
    1124 ECB             :  */
    1125                 : int
    1126 CBC         144 : pg_mbcharcliplen(const char *mbstr, int len, int limit)
    1127                 : {
    1128             144 :     int         clen = 0;
    1129             144 :     int         nch = 0;
    1130 ECB             :     int         l;
    1131                 : 
    1132                 :     /* optimization for single byte encoding */
    1133 GIC         144 :     if (pg_database_encoding_max_length() == 1)
    1134 UIC           0 :         return cliplen(mbstr, len, limit);
    1135                 : 
    1136 GIC         681 :     while (len > 0 && *mbstr)
    1137                 :     {
    1138             672 :         l = pg_mblen(mbstr);
    1139             672 :         nch++;
    1140             672 :         if (nch > limit)
    1141             135 :             break;
    1142 CBC         537 :         clen += l;
    1143 GIC         537 :         len -= l;
    1144 CBC         537 :         mbstr += l;
    1145                 :     }
    1146 GIC         144 :     return clen;
    1147                 : }
    1148                 : 
    1149                 : /* mbcliplen for any single-byte encoding */
    1150                 : static int
    1151           41840 : cliplen(const char *str, int len, int limit)
    1152 ECB             : {
    1153 GIC       41840 :     int         l = 0;
    1154                 : 
    1155           41840 :     len = Min(len, limit);
    1156 CBC      314961 :     while (l < len && str[l])
    1157 GIC      273121 :         l++;
    1158           41840 :     return l;
    1159                 : }
    1160 ECB             : 
    1161                 : void
    1162 GIC       10213 : SetDatabaseEncoding(int encoding)
    1163 ECB             : {
    1164 GIC       10213 :     if (!PG_VALID_BE_ENCODING(encoding))
    1165 LBC           0 :         elog(ERROR, "invalid database encoding: %d", encoding);
    1166                 : 
    1167 CBC       10213 :     DatabaseEncoding = &pg_enc2name_tbl[encoding];
    1168           10213 :     Assert(DatabaseEncoding->encoding == encoding);
    1169           10213 : }
    1170 ECB             : 
    1171                 : void
    1172 CBC       12832 : SetMessageEncoding(int encoding)
    1173 ECB             : {
    1174                 :     /* Some calls happen before we can elog()! */
    1175 GIC       12832 :     Assert(PG_VALID_ENCODING(encoding));
    1176 ECB             : 
    1177 GIC       12832 :     MessageEncoding = &pg_enc2name_tbl[encoding];
    1178           12832 :     Assert(MessageEncoding->encoding == encoding);
    1179           12832 : }
    1180                 : 
    1181                 : #ifdef ENABLE_NLS
    1182                 : /*
    1183                 :  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
    1184 ECB             :  * codeset.  Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
    1185                 :  * fail for gettext-internal causes like out-of-memory.
    1186                 :  */
    1187                 : static bool
    1188 GIC        1743 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
    1189                 : {
    1190            1743 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1191 ECB             :     int         i;
    1192 EUB             : 
    1193 GIC        3592 :     for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
    1194 ECB             :     {
    1195 GIC        3592 :         if (pg_enc2gettext_tbl[i].encoding == encoding)
    1196 ECB             :         {
    1197 CBC        1743 :             if (bind_textdomain_codeset(domainname,
    1198            1743 :                                         pg_enc2gettext_tbl[i].name) != NULL)
    1199            1743 :                 return true;
    1200 ECB             : 
    1201 LBC           0 :             if (elog_ok)
    1202               0 :                 elog(LOG, "bind_textdomain_codeset failed");
    1203                 :             else
    1204               0 :                 write_stderr("bind_textdomain_codeset failed");
    1205                 : 
    1206 UIC           0 :             break;
    1207                 :         }
    1208                 :     }
    1209 ECB             : 
    1210 UIC           0 :     return false;
    1211 ECB             : }
    1212                 : 
    1213                 : /*
    1214                 :  * Bind a gettext message domain to the codeset corresponding to the database
    1215                 :  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
    1216                 :  * Return the MessageEncoding implied by the new settings.
    1217                 :  *
    1218                 :  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
    1219                 :  * When that matches the database encoding, we don't need to do anything.  In
    1220                 :  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
    1221                 :  * database encoding, except for the C locale.  (On Windows, we also permit a
    1222                 :  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
    1223 EUB             :  * gettext to the right codeset.
    1224                 :  *
    1225 ECB             :  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
    1226                 :  * convenient departure for software that passes the strings to Windows ANSI
    1227                 :  * APIs, but we don't do that.  Compel gettext to use database encoding or,
    1228                 :  * failing that, the LC_CTYPE encoding as it would on other platforms.
    1229                 :  *
    1230                 :  * This function is called before elog() and palloc() are usable.
    1231                 :  */
    1232                 : int
    1233 CBC       14655 : pg_bind_textdomain_codeset(const char *domainname)
    1234                 : {
    1235           14655 :     bool        elog_ok = (CurrentMemoryContext != NULL);
    1236           14655 :     int         encoding = GetDatabaseEncoding();
    1237 ECB             :     int         new_msgenc;
    1238                 : 
    1239                 : #ifndef WIN32
    1240 GIC       14655 :     const char *ctype = setlocale(LC_CTYPE, NULL);
    1241                 : 
    1242           14655 :     if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
    1243                 : #endif
    1244            3532 :         if (encoding != PG_SQL_ASCII &&
    1245            1743 :             raw_pg_bind_textdomain_codeset(domainname, encoding))
    1246 CBC        1743 :             return encoding;
    1247                 : 
    1248           12912 :     new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
    1249 GIC       12912 :     if (new_msgenc < 0)
    1250 UIC           0 :         new_msgenc = PG_SQL_ASCII;
    1251 ECB             : 
    1252                 : #ifdef WIN32
    1253                 :     if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
    1254                 :         /* On failure, the old message encoding remains valid. */
    1255                 :         return GetMessageEncoding();
    1256                 : #endif
    1257                 : 
    1258 GIC       12912 :     return new_msgenc;
    1259 EUB             : }
    1260                 : #endif
    1261                 : 
    1262                 : /*
    1263                 :  * The database encoding, also called the server encoding, represents the
    1264                 :  * encoding of data stored in text-like data types.  Affected types include
    1265                 :  * cstring, text, varchar, name, xml, and json.
    1266                 :  */
    1267                 : int
    1268 GBC    23467668 : GetDatabaseEncoding(void)
    1269                 : {
    1270 GIC    23467668 :     return DatabaseEncoding->encoding;
    1271                 : }
    1272                 : 
    1273                 : const char *
    1274           21313 : GetDatabaseEncodingName(void)
    1275                 : {
    1276           21313 :     return DatabaseEncoding->name;
    1277                 : }
    1278                 : 
    1279                 : Datum
    1280              37 : getdatabaseencoding(PG_FUNCTION_ARGS)
    1281                 : {
    1282              37 :     return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
    1283                 : }
    1284                 : 
    1285                 : Datum
    1286 UIC           0 : pg_client_encoding(PG_FUNCTION_ARGS)
    1287                 : {
    1288               0 :     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
    1289                 : }
    1290                 : 
    1291 ECB             : Datum
    1292 GIC          18 : PG_char_to_encoding(PG_FUNCTION_ARGS)
    1293 ECB             : {
    1294 CBC          18 :     Name        s = PG_GETARG_NAME(0);
    1295                 : 
    1296 GIC          18 :     PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
    1297                 : }
    1298 ECB             : 
    1299                 : Datum
    1300 CBC        1658 : PG_encoding_to_char(PG_FUNCTION_ARGS)
    1301                 : {
    1302            1658 :     int32       encoding = PG_GETARG_INT32(0);
    1303            1658 :     const char *encoding_name = pg_encoding_to_char(encoding);
    1304 ECB             : 
    1305 GIC        1658 :     return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
    1306 ECB             : }
    1307                 : 
    1308 EUB             : /*
    1309                 :  * gettext() returns messages in this encoding.  This often matches the
    1310                 :  * database encoding, but it differs for SQL_ASCII databases, for processes
    1311                 :  * not attached to a database, and under a database encoding lacking iconv
    1312                 :  * support (MULE_INTERNAL).
    1313                 :  */
    1314                 : int
    1315 UIC           0 : GetMessageEncoding(void)
    1316 ECB             : {
    1317 UIC           0 :     return MessageEncoding->encoding;
    1318                 : }
    1319                 : 
    1320                 : 
    1321                 : /*
    1322                 :  * Generic character incrementer function.
    1323                 :  *
    1324                 :  * Not knowing anything about the properties of the encoding in use, we just
    1325                 :  * keep incrementing the last byte until we get a validly-encoded result,
    1326 ECB             :  * or we run out of values to try.  We don't bother to try incrementing
    1327                 :  * higher-order bytes, so there's no growth in runtime for wider characters.
    1328                 :  * (If we did try to do that, we'd need to consider the likelihood that 255
    1329                 :  * is not a valid final byte in the encoding.)
    1330                 :  */
    1331                 : static bool
    1332 CBC          51 : pg_generic_charinc(unsigned char *charptr, int len)
    1333                 : {
    1334              51 :     unsigned char *lastbyte = charptr + len - 1;
    1335                 :     mbchar_verifier mbverify;
    1336                 : 
    1337                 :     /* We can just invoke the character verifier directly. */
    1338              51 :     mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;
    1339                 : 
    1340              51 :     while (*lastbyte < (unsigned char) 255)
    1341                 :     {
    1342 GIC          51 :         (*lastbyte)++;
    1343              51 :         if ((*mbverify) (charptr, len) == len)
    1344 GBC          51 :             return true;
    1345                 :     }
    1346 EUB             : 
    1347 UIC           0 :     return false;
    1348                 : }
    1349                 : 
    1350 ECB             : /*
    1351                 :  * UTF-8 character incrementer function.
    1352                 :  *
    1353                 :  * For a one-byte character less than 0x7F, we just increment the byte.
    1354                 :  *
    1355                 :  * For a multibyte character, every byte but the first must fall between 0x80
    1356                 :  * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
    1357                 :  * the last byte that's not already at its maximum value.  If we can't find a
    1358                 :  * byte that's less than the maximum allowable value, we simply fail.  We also
    1359                 :  * need some special-case logic to skip regions used for surrogate pair
    1360                 :  * handling, as those should not occur in valid UTF-8.
    1361                 :  *
    1362                 :  * Note that we don't reset lower-order bytes back to their minimums, since
    1363                 :  * we can't afford to make an exhaustive search (see make_greater_string).
    1364                 :  */
    1365                 : static bool
    1366 GIC         929 : pg_utf8_increment(unsigned char *charptr, int length)
    1367                 : {
    1368                 :     unsigned char a;
    1369                 :     unsigned char limit;
    1370                 : 
    1371             929 :     switch (length)
    1372                 :     {
    1373 UBC           0 :         default:
    1374                 :             /* reject lengths 5 and 6 for now */
    1375               0 :             return false;
    1376 UIC           0 :         case 4:
    1377               0 :             a = charptr[3];
    1378               0 :             if (a < 0xBF)
    1379                 :             {
    1380               0 :                 charptr[3]++;
    1381               0 :                 break;
    1382                 :             }
    1383                 :             /* FALL THRU */
    1384                 :         case 3:
    1385               0 :             a = charptr[2];
    1386               0 :             if (a < 0xBF)
    1387                 :             {
    1388               0 :                 charptr[2]++;
    1389               0 :                 break;
    1390 ECB             :             }
    1391                 :             /* FALL THRU */
    1392                 :         case 2:
    1393 UIC           0 :             a = charptr[1];
    1394               0 :             switch (*charptr)
    1395                 :             {
    1396 LBC           0 :                 case 0xED:
    1397 UIC           0 :                     limit = 0x9F;
    1398 LBC           0 :                     break;
    1399 UIC           0 :                 case 0xF4:
    1400 LBC           0 :                     limit = 0x8F;
    1401               0 :                     break;
    1402               0 :                 default:
    1403 UIC           0 :                     limit = 0xBF;
    1404               0 :                     break;
    1405 EUB             :             }
    1406 UIC           0 :             if (a < limit)
    1407                 :             {
    1408               0 :                 charptr[1]++;
    1409               0 :                 break;
    1410                 :             }
    1411                 :             /* FALL THRU */
    1412                 :         case 1:
    1413 GIC         929 :             a = *charptr;
    1414             929 :             if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
    1415 UIC           0 :                 return false;
    1416 GIC         929 :             charptr[0]++;
    1417             929 :             break;
    1418                 :     }
    1419                 : 
    1420             929 :     return true;
    1421                 : }
    1422                 : 
    1423                 : /*
    1424 ECB             :  * EUC-JP character incrementer function.
    1425                 :  *
    1426                 :  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
    1427                 :  * representing JIS X 0201 characters with the second byte ranging between
    1428                 :  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
    1429                 :  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
    1430                 :  *
    1431 EUB             :  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
    1432                 :  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
    1433                 :  * is incremented if possible, otherwise the second-to-last byte.
    1434                 :  *
    1435                 :  * If the sequence starts with a value other than the above and its MSB
    1436                 :  * is set, it must be a two-byte sequence representing JIS X 0208 characters
    1437                 :  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
    1438                 :  * incremented if possible, otherwise the second-to-last byte.
    1439                 :  *
    1440                 :  * Otherwise, the sequence is a single-byte ASCII character. It is
    1441                 :  * incremented up to 0x7f.
    1442                 :  */
    1443                 : static bool
    1444 UBC           0 : pg_eucjp_increment(unsigned char *charptr, int length)
    1445                 : {
    1446 EUB             :     unsigned char c1,
    1447                 :                 c2;
    1448                 :     int         i;
    1449                 : 
    1450 UIC           0 :     c1 = *charptr;
    1451 EUB             : 
    1452 UBC           0 :     switch (c1)
    1453                 :     {
    1454               0 :         case SS2:               /* JIS X 0201 */
    1455               0 :             if (length != 2)
    1456               0 :                 return false;
    1457 EUB             : 
    1458 UBC           0 :             c2 = charptr[1];
    1459 EUB             : 
    1460 UBC           0 :             if (c2 >= 0xdf)
    1461               0 :                 charptr[0] = charptr[1] = 0xa1;
    1462               0 :             else if (c2 < 0xa1)
    1463 UIC           0 :                 charptr[1] = 0xa1;
    1464 EUB             :             else
    1465 UIC           0 :                 charptr[1]++;
    1466 UBC           0 :             break;
    1467 EUB             : 
    1468 UIC           0 :         case SS3:               /* JIS X 0212 */
    1469               0 :             if (length != 3)
    1470               0 :                 return false;
    1471 ECB             : 
    1472 LBC           0 :             for (i = 2; i > 0; i--)
    1473 EUB             :             {
    1474 LBC           0 :                 c2 = charptr[i];
    1475               0 :                 if (c2 < 0xa1)
    1476                 :                 {
    1477 UIC           0 :                     charptr[i] = 0xa1;
    1478 LBC           0 :                     return true;
    1479                 :                 }
    1480 UIC           0 :                 else if (c2 < 0xfe)
    1481                 :                 {
    1482               0 :                     charptr[i]++;
    1483               0 :                     return true;
    1484                 :                 }
    1485                 :             }
    1486                 : 
    1487                 :             /* Out of 3-byte code region */
    1488               0 :             return false;
    1489                 : 
    1490               0 :         default:
    1491               0 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1492                 :             {
    1493               0 :                 if (length != 2)
    1494               0 :                     return false;
    1495                 : 
    1496               0 :                 for (i = 1; i >= 0; i--)
    1497                 :                 {
    1498               0 :                     c2 = charptr[i];
    1499               0 :                     if (c2 < 0xa1)
    1500                 :                     {
    1501               0 :                         charptr[i] = 0xa1;
    1502 UBC           0 :                         return true;
    1503                 :                     }
    1504 UIC           0 :                     else if (c2 < 0xfe)
    1505                 :                     {
    1506               0 :                         charptr[i]++;
    1507               0 :                         return true;
    1508 EUB             :                     }
    1509                 :                 }
    1510                 : 
    1511                 :                 /* Out of 2 byte code region */
    1512 UBC           0 :                 return false;
    1513 EUB             :             }
    1514                 :             else
    1515                 :             {                   /* ASCII, single byte */
    1516 UBC           0 :                 if (c1 > 0x7e)
    1517 UIC           0 :                     return false;
    1518 UBC           0 :                 (*charptr)++;
    1519 EUB             :             }
    1520 UBC           0 :             break;
    1521 EUB             :     }
    1522                 : 
    1523 UBC           0 :     return true;
    1524 EUB             : }
    1525                 : 
    1526                 : /*
    1527                 :  * get the character incrementer for the encoding for the current database
    1528                 :  */
    1529                 : mbcharacter_incrementer
    1530 GBC         980 : pg_database_encoding_character_incrementer(void)
    1531                 : {
    1532 EUB             :     /*
    1533                 :      * Eventually it might be best to add a field to pg_wchar_table[], but for
    1534                 :      * now we just use a switch.
    1535                 :      */
    1536 GBC         980 :     switch (GetDatabaseEncoding())
    1537                 :     {
    1538             929 :         case PG_UTF8:
    1539 GIC         929 :             return pg_utf8_increment;
    1540 EUB             : 
    1541 UBC           0 :         case PG_EUC_JP:
    1542 UIC           0 :             return pg_eucjp_increment;
    1543                 : 
    1544 GIC          51 :         default:
    1545              51 :             return pg_generic_charinc;
    1546 EUB             :     }
    1547                 : }
    1548                 : 
    1549                 : /*
    1550                 :  * fetch maximum length of the encoding for the current database
    1551                 :  */
    1552                 : int
    1553 GIC     8841119 : pg_database_encoding_max_length(void)
    1554 EUB             : {
    1555 GIC     8841119 :     return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
    1556 EUB             : }
    1557                 : 
    1558                 : /*
    1559                 :  * Verify mbstr to make sure that it is validly encoded in the current
    1560                 :  * database encoding.  Otherwise same as pg_verify_mbstr().
    1561                 :  */
    1562                 : bool
    1563 GIC      152232 : pg_verifymbstr(const char *mbstr, int len, bool noError)
    1564 EUB             : {
    1565 GBC      152232 :     return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
    1566                 : }
    1567                 : 
    1568                 : /*
    1569                 :  * Verify mbstr to make sure that it is validly encoded in the specified
    1570 EUB             :  * encoding.
    1571                 :  */
    1572                 : bool
    1573 GIC     1054176 : pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
    1574 EUB             : {
    1575                 :     int         oklen;
    1576                 : 
    1577 GIC     1054176 :     Assert(PG_VALID_ENCODING(encoding));
    1578 EUB             : 
    1579 GIC     1054176 :     oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
    1580         1054176 :     if (oklen != len)
    1581 EUB             :     {
    1582 GIC           4 :         if (noError)
    1583 UIC           0 :             return false;
    1584 GIC           4 :         report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
    1585                 :     }
    1586         1054172 :     return true;
    1587                 : }
    1588 ECB             : 
    1589                 : /*
    1590                 :  * Verify mbstr to make sure that it is validly encoded in the specified
    1591                 :  * encoding.
    1592                 :  *
    1593                 :  * mbstr is not necessarily zero terminated; length of mbstr is
    1594                 :  * specified by len.
    1595                 :  *
    1596                 :  * If OK, return length of string in the encoding.
    1597                 :  * If a problem is found, return -1 when noError is
    1598                 :  * true; when noError is false, ereport() a descriptive message.
    1599 EUB             :  *
    1600                 :  * Note: We cannot use the faster encoding-specific mbverifystr() function
    1601                 :  * here, because we need to count the number of characters in the string.
    1602 ECB             :  */
    1603                 : int
    1604 UIC           0 : pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
    1605                 : {
    1606                 :     mbchar_verifier mbverifychar;
    1607                 :     int         mb_len;
    1608                 : 
    1609               0 :     Assert(PG_VALID_ENCODING(encoding));
    1610                 : 
    1611 ECB             :     /*
    1612                 :      * In single-byte encodings, we need only reject nulls (\0).
    1613                 :      */
    1614 UIC           0 :     if (pg_encoding_max_length(encoding) <= 1)
    1615                 :     {
    1616               0 :         const char *nullpos = memchr(mbstr, 0, len);
    1617                 : 
    1618               0 :         if (nullpos == NULL)
    1619               0 :             return len;
    1620               0 :         if (noError)
    1621 LBC           0 :             return -1;
    1622 UIC           0 :         report_invalid_encoding(encoding, nullpos, 1);
    1623 ECB             :     }
    1624                 : 
    1625                 :     /* fetch function pointer just once */
    1626 UIC           0 :     mbverifychar = pg_wchar_table[encoding].mbverifychar;
    1627                 : 
    1628               0 :     mb_len = 0;
    1629                 : 
    1630               0 :     while (len > 0)
    1631 ECB             :     {
    1632                 :         int         l;
    1633                 : 
    1634                 :         /* fast path for ASCII-subset characters */
    1635 LBC           0 :         if (!IS_HIGHBIT_SET(*mbstr))
    1636                 :         {
    1637               0 :             if (*mbstr != '\0')
    1638 ECB             :             {
    1639 UIC           0 :                 mb_len++;
    1640 LBC           0 :                 mbstr++;
    1641 UBC           0 :                 len--;
    1642 LBC           0 :                 continue;
    1643                 :             }
    1644               0 :             if (noError)
    1645 UIC           0 :                 return -1;
    1646               0 :             report_invalid_encoding(encoding, mbstr, len);
    1647                 :         }
    1648                 : 
    1649               0 :         l = (*mbverifychar) ((const unsigned char *) mbstr, len);
    1650                 : 
    1651               0 :         if (l < 0)
    1652                 :         {
    1653               0 :             if (noError)
    1654               0 :                 return -1;
    1655               0 :             report_invalid_encoding(encoding, mbstr, len);
    1656                 :         }
    1657                 : 
    1658               0 :         mbstr += l;
    1659               0 :         len -= l;
    1660               0 :         mb_len++;
    1661                 :     }
    1662 UBC           0 :     return mb_len;
    1663                 : }
    1664                 : 
    1665                 : /*
    1666                 :  * check_encoding_conversion_args: check arguments of a conversion function
    1667 EUB             :  *
    1668                 :  * "expected" arguments can be either an encoding ID or -1 to indicate that
    1669                 :  * the caller will check whether it accepts the ID.
    1670                 :  *
    1671                 :  * Note: the errors here are not really user-facing, so elog instead of
    1672                 :  * ereport seems sufficient.  Also, we trust that the "expected" encoding
    1673                 :  * arguments are valid encoding IDs, but we don't trust the actuals.
    1674                 :  */
    1675                 : void
    1676 GBC        3545 : check_encoding_conversion_args(int src_encoding,
    1677 EUB             :                                int dest_encoding,
    1678                 :                                int len,
    1679                 :                                int expected_src_encoding,
    1680                 :                                int expected_dest_encoding)
    1681                 : {
    1682 GIC        3545 :     if (!PG_VALID_ENCODING(src_encoding))
    1683 UIC           0 :         elog(ERROR, "invalid source encoding ID: %d", src_encoding);
    1684 GBC        3545 :     if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
    1685 UIC           0 :         elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
    1686 EUB             :              pg_enc2name_tbl[expected_src_encoding].name,
    1687                 :              pg_enc2name_tbl[src_encoding].name);
    1688 GBC        3545 :     if (!PG_VALID_ENCODING(dest_encoding))
    1689 UIC           0 :         elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
    1690 GIC        3545 :     if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
    1691 UIC           0 :         elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
    1692                 :              pg_enc2name_tbl[expected_dest_encoding].name,
    1693 EUB             :              pg_enc2name_tbl[dest_encoding].name);
    1694 GIC        3545 :     if (len < 0)
    1695 UBC           0 :         elog(ERROR, "encoding conversion length must not be negative");
    1696 GIC        3545 : }
    1697 EUB             : 
    1698                 : /*
    1699                 :  * report_invalid_encoding: complain about invalid multibyte character
    1700                 :  *
    1701                 :  * note: len is remaining length of string, not length of character;
    1702                 :  * len must be greater than zero, as we always examine the first byte.
    1703                 :  */
    1704                 : void
    1705 GIC        1471 : report_invalid_encoding(int encoding, const char *mbstr, int len)
    1706                 : {
    1707 GBC        1471 :     int         l = pg_encoding_mblen(encoding, mbstr);
    1708                 :     char        buf[8 * 5 + 1];
    1709            1471 :     char       *p = buf;
    1710                 :     int         j,
    1711 EUB             :                 jlimit;
    1712                 : 
    1713 GBC        1471 :     jlimit = Min(l, len);
    1714 GIC        1471 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1715                 : 
    1716 GBC        4559 :     for (j = 0; j < jlimit; j++)
    1717 EUB             :     {
    1718 GBC        3088 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    1719 GIC        3088 :         if (j < jlimit - 1)
    1720 GBC        1617 :             p += sprintf(p, " ");
    1721                 :     }
    1722                 : 
    1723 GIC        1471 :     ereport(ERROR,
    1724                 :             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
    1725                 :              errmsg("invalid byte sequence for encoding \"%s\": %s",
    1726                 :                     pg_enc2name_tbl[encoding].name,
    1727                 :                     buf)));
    1728                 : }
    1729                 : 
    1730                 : /*
    1731                 :  * report_untranslatable_char: complain about untranslatable character
    1732                 :  *
    1733                 :  * note: len is remaining length of string, not length of character;
    1734 ECB             :  * len must be greater than zero, as we always examine the first byte.
    1735                 :  */
    1736                 : void
    1737 GIC         468 : report_untranslatable_char(int src_encoding, int dest_encoding,
    1738                 :                            const char *mbstr, int len)
    1739                 : {
    1740 CBC         468 :     int         l = pg_encoding_mblen(src_encoding, mbstr);
    1741 EUB             :     char        buf[8 * 5 + 1];
    1742 CBC         468 :     char       *p = buf;
    1743 EUB             :     int         j,
    1744                 :                 jlimit;
    1745                 : 
    1746 CBC         468 :     jlimit = Min(l, len);
    1747 GBC         468 :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    1748 ECB             : 
    1749 GBC        1764 :     for (j = 0; j < jlimit; j++)
    1750                 :     {
    1751 GIC        1296 :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    1752 CBC        1296 :         if (j < jlimit - 1)
    1753 GBC         828 :             p += sprintf(p, " ");
    1754 ECB             :     }
    1755                 : 
    1756 GIC         468 :     ereport(ERROR,
    1757                 :             (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
    1758                 :              errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
    1759                 :                     buf,
    1760                 :                     pg_enc2name_tbl[src_encoding].name,
    1761                 :                     pg_enc2name_tbl[dest_encoding].name)));
    1762                 : }
    1763 ECB             : 
    1764                 : 
    1765                 : #ifdef WIN32
    1766                 : /*
    1767                 :  * Convert from MessageEncoding to a palloc'ed, null-terminated utf16
    1768                 :  * string. The character length is also passed to utf16len if not
    1769                 :  * null. Returns NULL iff failed. Before MessageEncoding initialization, "str"
    1770                 :  * should be ASCII-only; this will function as though MessageEncoding is UTF8.
    1771                 :  */
    1772                 : WCHAR *
    1773                 : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
    1774                 : {
    1775                 :     int         msgenc = GetMessageEncoding();
    1776                 :     WCHAR      *utf16;
    1777                 :     int         dstlen;
    1778                 :     UINT        codepage;
    1779                 : 
    1780                 :     if (msgenc == PG_SQL_ASCII)
    1781                 :         /* No conversion is possible, and SQL_ASCII is never utf16. */
    1782                 :         return NULL;
    1783                 : 
    1784                 :     codepage = pg_enc2name_tbl[msgenc].codepage;
    1785                 : 
    1786                 :     /*
    1787                 :      * Use MultiByteToWideChar directly if there is a corresponding codepage,
    1788                 :      * or double conversion through UTF8 if not.  Double conversion is needed,
    1789                 :      * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
    1790                 :      */
    1791                 :     if (codepage != 0)
    1792                 :     {
    1793                 :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1794                 :         dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
    1795                 :         utf16[dstlen] = (WCHAR) 0;
    1796                 :     }
    1797                 :     else
    1798                 :     {
    1799                 :         char       *utf8;
    1800                 : 
    1801                 :         /*
    1802                 :          * XXX pg_do_encoding_conversion() requires a transaction.  In the
    1803                 :          * absence of one, hope for the input to be valid UTF8.
    1804                 :          */
    1805                 :         if (IsTransactionState())
    1806                 :         {
    1807                 :             utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
    1808                 :                                                       len,
    1809                 :                                                       msgenc,
    1810                 :                                                       PG_UTF8);
    1811                 :             if (utf8 != str)
    1812                 :                 len = strlen(utf8);
    1813                 :         }
    1814                 :         else
    1815                 :             utf8 = (char *) str;
    1816                 : 
    1817                 :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1818                 :         dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
    1819                 :         utf16[dstlen] = (WCHAR) 0;
    1820                 : 
    1821                 :         if (utf8 != str)
    1822                 :             pfree(utf8);
    1823                 :     }
    1824                 : 
    1825                 :     if (dstlen == 0 && len > 0)
    1826                 :     {
    1827                 :         pfree(utf16);
    1828                 :         return NULL;            /* error */
    1829                 :     }
    1830                 : 
    1831                 :     if (utf16len)
    1832                 :         *utf16len = dstlen;
    1833                 :     return utf16;
    1834                 : }
    1835                 : 
    1836                 : #endif                          /* WIN32 */
        

Generated by: LCOV version v1.16-55-g56c0a2a