LCOV - differential code coverage report
Current view: top level - src/port - chklocale.c (source / functions) Coverage Total Hit LBC UIC GBC GIC CBC EUB ECB
Current: Differential Code Coverage HEAD vs 15 Lines: 82.9 % 41 34 1 6 1 23 10 6 23
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 1 1 1 1
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * chklocale.c
       4                 :  *      Functions for handling locale-related info
       5                 :  *
       6                 :  *
       7                 :  * Copyright (c) 1996-2023, PostgreSQL Global Development Group
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/port/chklocale.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : 
      16                 : #ifndef FRONTEND
      17                 : #include "postgres.h"
      18                 : #else
      19                 : #include "postgres_fe.h"
      20                 : #endif
      21                 : 
      22                 : #ifdef HAVE_LANGINFO_H
      23                 : #include <langinfo.h>
      24                 : #endif
      25                 : 
      26                 : #include "mb/pg_wchar.h"
      27                 : 
      28                 : 
      29                 : /*
      30                 :  * This table needs to recognize all the CODESET spellings for supported
      31                 :  * backend encodings, as well as frontend-only encodings where possible
      32                 :  * (the latter case is currently only needed for initdb to recognize
      33                 :  * error situations).  On Windows, we rely on entries for codepage
      34                 :  * numbers (CPnnn).
      35                 :  *
      36                 :  * Note that we search the table with pg_strcasecmp(), so variant
      37                 :  * capitalizations don't need their own entries.
      38                 :  */
      39                 : struct encoding_match
      40                 : {
      41                 :     enum pg_enc pg_enc_code;    /* PostgreSQL encoding ID reported for this spelling */
      42                 :     const char *system_enc_name;    /* codeset spelling to match; NULL marks the end of the table */
      43                 : };
      44                 : 
      45                 : static const struct encoding_match encoding_match_list[] = {    /* scanned front to back; the first matching name wins */
      46                 :     {PG_EUC_JP, "EUC-JP"},
      47                 :     {PG_EUC_JP, "eucJP"},
      48                 :     {PG_EUC_JP, "IBM-eucJP"},
      49                 :     {PG_EUC_JP, "sdeckanji"},
      50                 :     {PG_EUC_JP, "CP20932"},     /* "CPnnn" spellings are Windows code page numbers */
      51                 : 
      52                 :     {PG_EUC_CN, "EUC-CN"},
      53                 :     {PG_EUC_CN, "eucCN"},
      54                 :     {PG_EUC_CN, "IBM-eucCN"},
      55                 :     {PG_EUC_CN, "GB2312"},
      56                 :     {PG_EUC_CN, "dechanzi"},
      57                 :     {PG_EUC_CN, "CP20936"},
      58                 : 
      59                 :     {PG_EUC_KR, "EUC-KR"},
      60                 :     {PG_EUC_KR, "eucKR"},
      61                 :     {PG_EUC_KR, "IBM-eucKR"},
      62                 :     {PG_EUC_KR, "deckorean"},
      63                 :     {PG_EUC_KR, "5601"},
      64                 :     {PG_EUC_KR, "CP51949"},
      65                 : 
      66                 :     {PG_EUC_TW, "EUC-TW"},
      67                 :     {PG_EUC_TW, "eucTW"},
      68                 :     {PG_EUC_TW, "IBM-eucTW"},
      69                 :     {PG_EUC_TW, "cns11643"},
      70                 :     /* No codepage for EUC-TW ? */
      71                 : 
      72                 :     {PG_UTF8, "UTF-8"},
      73                 :     {PG_UTF8, "utf8"},
      74                 :     {PG_UTF8, "CP65001"},
      75                 : 
      76                 :     {PG_LATIN1, "ISO-8859-1"},
      77                 :     {PG_LATIN1, "ISO8859-1"},
      78                 :     {PG_LATIN1, "iso88591"},
      79                 :     {PG_LATIN1, "CP28591"},
      80                 : 
      81                 :     {PG_LATIN2, "ISO-8859-2"},
      82                 :     {PG_LATIN2, "ISO8859-2"},
      83                 :     {PG_LATIN2, "iso88592"},
      84                 :     {PG_LATIN2, "CP28592"},
      85                 : 
      86                 :     {PG_LATIN3, "ISO-8859-3"},
      87                 :     {PG_LATIN3, "ISO8859-3"},
      88                 :     {PG_LATIN3, "iso88593"},
      89                 :     {PG_LATIN3, "CP28593"},
      90                 : 
      91                 :     {PG_LATIN4, "ISO-8859-4"},
      92                 :     {PG_LATIN4, "ISO8859-4"},
      93                 :     {PG_LATIN4, "iso88594"},
      94                 :     {PG_LATIN4, "CP28594"},
      95                 : 
      96                 :     {PG_LATIN5, "ISO-8859-9"},
      97                 :     {PG_LATIN5, "ISO8859-9"},
      98                 :     {PG_LATIN5, "iso88599"},
      99                 :     {PG_LATIN5, "CP28599"},
     100                 : 
     101                 :     {PG_LATIN6, "ISO-8859-10"},
     102                 :     {PG_LATIN6, "ISO8859-10"},
     103                 :     {PG_LATIN6, "iso885910"},
     104                 : 
     105                 :     {PG_LATIN7, "ISO-8859-13"},
     106                 :     {PG_LATIN7, "ISO8859-13"},
     107                 :     {PG_LATIN7, "iso885913"},
     108                 : 
     109                 :     {PG_LATIN8, "ISO-8859-14"},
     110                 :     {PG_LATIN8, "ISO8859-14"},
     111                 :     {PG_LATIN8, "iso885914"},
     112                 : 
     113                 :     {PG_LATIN9, "ISO-8859-15"},
     114                 :     {PG_LATIN9, "ISO8859-15"},
     115                 :     {PG_LATIN9, "iso885915"},
     116                 :     {PG_LATIN9, "CP28605"},
     117                 : 
     118                 :     {PG_LATIN10, "ISO-8859-16"},
     119                 :     {PG_LATIN10, "ISO8859-16"},
     120                 :     {PG_LATIN10, "iso885916"},
     121                 : 
     122                 :     {PG_KOI8R, "KOI8-R"},
     123                 :     {PG_KOI8R, "CP20866"},
     124                 : 
     125                 :     {PG_KOI8U, "KOI8-U"},
     126                 :     {PG_KOI8U, "CP21866"},
     127                 : 
     128                 :     {PG_WIN866, "CP866"},
     129                 :     {PG_WIN874, "CP874"},
     130                 :     {PG_WIN1250, "CP1250"},
     131                 :     {PG_WIN1251, "CP1251"},
     132                 :     {PG_WIN1251, "ansi-1251"},
     133                 :     {PG_WIN1252, "CP1252"},
     134                 :     {PG_WIN1253, "CP1253"},
     135                 :     {PG_WIN1254, "CP1254"},
     136                 :     {PG_WIN1255, "CP1255"},
     137                 :     {PG_WIN1256, "CP1256"},
     138                 :     {PG_WIN1257, "CP1257"},
     139                 :     {PG_WIN1258, "CP1258"},
     140                 : 
     141                 :     {PG_ISO_8859_5, "ISO-8859-5"},
     142                 :     {PG_ISO_8859_5, "ISO8859-5"},
     143                 :     {PG_ISO_8859_5, "iso88595"},
     144                 :     {PG_ISO_8859_5, "CP28595"},
     145                 : 
     146                 :     {PG_ISO_8859_6, "ISO-8859-6"},
     147                 :     {PG_ISO_8859_6, "ISO8859-6"},
     148                 :     {PG_ISO_8859_6, "iso88596"},
     149                 :     {PG_ISO_8859_6, "CP28596"},
     150                 : 
     151                 :     {PG_ISO_8859_7, "ISO-8859-7"},
     152                 :     {PG_ISO_8859_7, "ISO8859-7"},
     153                 :     {PG_ISO_8859_7, "iso88597"},
     154                 :     {PG_ISO_8859_7, "CP28597"},
     155                 : 
     156                 :     {PG_ISO_8859_8, "ISO-8859-8"},
     157                 :     {PG_ISO_8859_8, "ISO8859-8"},
     158                 :     {PG_ISO_8859_8, "iso88598"},
     159                 :     {PG_ISO_8859_8, "CP28598"},
     160                 : 
     161                 :     {PG_SJIS, "SJIS"},
     162                 :     {PG_SJIS, "PCK"},
     163                 :     {PG_SJIS, "CP932"},
     164                 :     {PG_SJIS, "SHIFT_JIS"},
     165                 : 
     166                 :     {PG_BIG5, "BIG5"},
     167                 :     {PG_BIG5, "BIG5HKSCS"},
     168                 :     {PG_BIG5, "Big5-HKSCS"},
     169                 :     {PG_BIG5, "CP950"},
     170                 : 
     171                 :     {PG_GBK, "GBK"},
     172                 :     {PG_GBK, "CP936"},
     173                 : 
     174                 :     {PG_UHC, "UHC"},
     175                 :     {PG_UHC, "CP949"},
     176                 : 
     177                 :     {PG_JOHAB, "JOHAB"},
     178                 :     {PG_JOHAB, "CP1361"},
     179                 : 
     180                 :     {PG_GB18030, "GB18030"},
     181                 :     {PG_GB18030, "CP54936"},
     182                 : 
     183                 :     {PG_SHIFT_JIS_2004, "SJIS_2004"},
     184                 : 
     185                 :     {PG_SQL_ASCII, "US-ASCII"},
     186                 : 
     187                 :     {PG_SQL_ASCII, NULL}        /* end marker: the lookup loops stop at the NULL name */
     188                 : };
     189                 : 
     190                 : #ifdef WIN32
     191                 : /*
     192                 :  * On Windows, use CP<code page number> instead of the nl_langinfo() result
     193                 :  *
     194                 :  * This routine uses GetLocaleInfoEx() to parse short locale names like
     195                 :  * "de-DE", "fr-FR", etc.  If those cannot be parsed correctly, processing
     196                 :  * falls back to the pre-VS-2010 manual parsing, which uses
     197                 :  * <Language>_<Country>.<CodePage> as a base.
     198                 :  *
     199                 :  * Returns a malloc()'d string for the caller to free, or NULL if malloc() fails or no codepage can be extracted from ctype.
     200                 :  */
     201                 : static char *
     202                 : win32_langinfo(const char *ctype)
     203                 : {
     204                 :     char       *r = NULL;       /* result; stays NULL on any failure */
     205                 :     char       *codepage;
     206                 : 
     207                 : #if defined(_MSC_VER)
     208                 :     uint32      cp;
     209                 :     WCHAR       wctype[LOCALE_NAME_MAX_LENGTH];
     210                 : 
     211                 :     memset(wctype, 0, sizeof(wctype));
     212                 :     MultiByteToWideChar(CP_ACP, 0, ctype, -1, wctype, LOCALE_NAME_MAX_LENGTH);   /* GetLocaleInfoEx() takes a wide-character locale name */
     213                 : 
     214                 :     if (GetLocaleInfoEx(wctype,
     215                 :                         LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
     216                 :                         (LPWSTR) &cp, sizeof(cp) / sizeof(WCHAR)) > 0)
     217                 :     {
     218                 :         r = malloc(16);         /* excess: "CP" + at most 10 digits of a uint32 + NUL is 13 bytes */
     219                 :         if (r != NULL)
     220                 :         {
     221                 :             /*
     222                 :              * If the return value is CP_ACP that means no ANSI code page is
     223                 :              * available, so only Unicode can be used for the locale.
     224                 :              */
     225                 :             if (cp == CP_ACP)
     226                 :                 strcpy(r, "utf8");
     227                 :             else
     228                 :                 sprintf(r, "CP%u", cp);
     229                 :         }
     230                 :     }
     231                 :     else
     232                 : #endif
     233                 :     {
     234                 :         /*
     235                 :          * Locale format on Win32 is <Language>_<Country>.<CodePage>.  For
     236                 :          * example, English_United States.1252.  If we see digits after the
     237                 :          * last dot, assume it's a codepage number.  Otherwise, we might be
     238                 :          * dealing with a Unix-style locale string; Windows' setlocale() will
     239                 :          * take those even though GetLocaleInfoEx() won't, so we end up here.
     240                 :          * In that case, just return what's after the last dot and hope we can
     241                 :          * find it in our table.
     242                 :          */
     243                 :         codepage = strrchr(ctype, '.');
     244                 :         if (codepage != NULL)
     245                 :         {
     246                 :             size_t      ln;
     247                 : 
     248                 :             codepage++;         /* step past the dot */
     249                 :             ln = strlen(codepage);
     250                 :             r = malloc(ln + 3);         /* room for the "CP" prefix, the suffix itself, and the NUL */
     251                 :             if (r != NULL)
     252                 :             {
     253                 :                 if (strspn(codepage, "0123456789") == ln)       /* suffix is all digits (or empty)? */
     254                 :                     sprintf(r, "CP%s", codepage);
     255                 :                 else
     256                 :                     strcpy(r, codepage);
     257                 :             }
     258                 :         }
     259                 :     }
     260                 : 
     261                 :     return r;
     262                 : }
     263                 : 
     264                 : #ifndef FRONTEND
     265                 : /*
     266                 :  * Given a Windows code page identifier, find the corresponding PostgreSQL
     267                 :  * encoding.  Issue a warning and return -1 if none found.  The lookup key is "CP" followed by the decimal code page number.
     268                 :  */
     269                 : int
     270                 : pg_codepage_to_encoding(UINT cp)
     271                 : {
     272                 :     char        sys[16];        /* holds "CP<number>" */
     273                 :     int         i;
     274                 : 
     275                 :     sprintf(sys, "CP%u", cp);
     276                 : 
     277                 :     /* Check the table; the comparison is case-insensitive and the first match wins */
     278                 :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     279                 :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     280                 :             return encoding_match_list[i].pg_enc_code;
     281                 : 
     282                 :     ereport(WARNING,
     283 ECB             :             (errmsg("could not determine encoding for codeset \"%s\"", sys)));
     284                 : 
     285                 :     return -1;
     286                 : }
     287                 : #endif
     288                 : #endif                          /* WIN32 */
     289                 : 
     290                 : #if (defined(HAVE_LANGINFO_H) && defined(CODESET)) || defined(WIN32)
     291                 : 
     292                 : /*
     293                 :  * Given a setting for LC_CTYPE, return the Postgres ID of the associated
     294                 :  * encoding, if we can determine it.  Return -1 if we can't determine it.
     295                 :  *
     296                 :  * Pass in NULL to get the encoding for the current locale setting.
     297                 :  * Pass "" to get the encoding selected by the server's environment.
     298                 :  *
     299                 :  * If the result is PG_SQL_ASCII, callers should treat it as being compatible
     300                 :  * with any desired encoding.
     301 EUB             :  *
     302                 :  * If running in the backend and write_message is false, this function must
     303 ECB             :  * cope with the possibility that elog() and palloc() are not yet usable (the code below uses only strdup()/free() for its temporary strings).
     304                 :  */
     305 EUB             : int
     306 GIC      167798 : pg_get_encoding_from_locale(const char *ctype, bool write_message)
     307 ECB             : {
     308                 :     char       *sys;            /* malloc'd codeset name; freed on every path that gets past the NULL check */
     309                 :     int         i;
     310                 : 
     311                 :     /* Get the CODESET property, and also LC_CTYPE if not passed in */
     312 GIC      167798 :     if (ctype)
     313                 :     {
     314                 :         char       *save;       /* copy of the LC_CTYPE setting in effect on entry */
     315 ECB             :         char       *name;       /* locale name returned by setlocale() for ctype */
     316                 : 
     317                 :         /* If locale is C or POSIX, we can allow all encodings */
     318 GIC      309011 :         if (pg_strcasecmp(ctype, "C") == 0 ||
     319          154199 :             pg_strcasecmp(ctype, "POSIX") == 0)
     320             940 :             return PG_SQL_ASCII;
     321                 : 
     322 CBC      153872 :         save = setlocale(LC_CTYPE, NULL);
     323          153872 :         if (!save)
     324 UIC           0 :             return -1;          /* setlocale() broken? */
     325                 :         /* must copy result, or it might change after setlocale */
     326 GIC      153872 :         save = strdup(save);
     327          153872 :         if (!save)
     328 LBC           0 :             return -1;          /* out of memory; unlikely */
     329 ECB             : 
     330 GBC      153872 :         name = setlocale(LC_CTYPE, ctype);
     331 GIC      153872 :         if (!name)
     332                 :         {
     333 CBC          12 :             free(save);
     334              12 :             return -1;          /* bogus ctype passed in? */
     335 ECB             :         }
     336                 : 
     337                 : #ifndef WIN32
     338 CBC      153860 :         sys = nl_langinfo(CODESET);
     339          153860 :         if (sys)
     340          153860 :             sys = strdup(sys);
     341                 : #else
     342                 :         sys = win32_langinfo(name);
     343                 : #endif
     344                 : 
     345 GIC      153860 :         setlocale(LC_CTYPE, save);      /* put back the LC_CTYPE setting saved above */
     346 CBC      153860 :         free(save);
     347 EUB             :     }
     348                 :     else
     349                 :     {
     350 ECB             :         /* much easier... no locale switch is needed, so nothing to save or restore */
     351 GIC       12986 :         ctype = setlocale(LC_CTYPE, NULL);
     352 CBC       12986 :         if (!ctype)
     353 UIC           0 :             return -1;          /* setlocale() broken? */
     354 ECB             : 
     355                 :         /* If locale is C or POSIX, we can allow all encodings */
     356 GIC       25873 :         if (pg_strcasecmp(ctype, "C") == 0 ||
     357           12887 :             pg_strcasecmp(ctype, "POSIX") == 0)
     358              99 :             return PG_SQL_ASCII;
     359                 : 
     360                 : #ifndef WIN32
     361           12887 :         sys = nl_langinfo(CODESET);
     362           12887 :         if (sys)
     363           12887 :             sys = strdup(sys);
     364                 : #else
     365                 :         sys = win32_langinfo(ctype);
     366                 : #endif
     367                 :     }
     368                 : 
     369          166747 :     if (!sys)
     370 UIC           0 :         return -1;              /* no codeset string obtained (e.g. out of memory); unlikely */
     371                 : 
     372                 :     /* Check the table; the comparison is case-insensitive and the first match wins */
     373 GIC     4925335 :     for (i = 0; encoding_match_list[i].system_enc_name; i++)
     374                 :     {
     375         4923517 :         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
     376                 :         {
     377          164929 :             free(sys);
     378 CBC      164929 :             return encoding_match_list[i].pg_enc_code;
     379                 :         }
     380                 :     }
     381 EUB             : 
     382                 :     /* Special-case kluges for particular platforms go here */
     383                 : 
     384                 : #ifdef __darwin__
     385                 : 
     386                 :     /*
     387                 :      * Current macOS has many locales that report an empty string for CODESET,
     388                 :      * but they all seem to actually use UTF-8.
     389                 :      */
     390                 :     if (strlen(sys) == 0)
     391                 :     {
     392 ECB             :         free(sys);
     393                 :         return PG_UTF8;
     394                 :     }
     395                 : #endif
     396                 : 
     397                 :     /*
     398                 :      * We print a warning if we got a CODESET string but couldn't recognize
     399                 :      * it.  This means we need another entry in the table.
     400                 :      */
     401 GIC        1818 :     if (write_message)
     402                 :     {
     403                 : #ifdef FRONTEND
     404 UIC           0 :         fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
     405                 :                 ctype, sys);
     406                 :         /* keep newline separate so there's only one translatable string */
     407               0 :         fputc('\n', stderr);
     408                 : #else
     409               0 :         ereport(WARNING,
     410                 :                 (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
     411                 :                         ctype, sys)));
     412                 : #endif
     413                 :     }
     414                 : 
     415 GIC        1818 :     free(sys);
     416            1818 :     return -1;
     417                 : }
     418                 : #else                           /* (HAVE_LANGINFO_H && CODESET) || WIN32 */
     419                 : 
     420                 : /*
     421                 :  * stub if no multi-language platform support
     422                 :  *
     423                 :  * Note: we could return -1 here, but that would have the effect of
     424                 :  * forcing users to specify an encoding to initdb on such platforms.
     425                 :  * It seems better to silently default to SQL_ASCII.
     426                 :  */
     427                 : int
     428                 : pg_get_encoding_from_locale(const char *ctype, bool write_message)
     429                 : {
     430                 :     return PG_SQL_ASCII;        /* both arguments are ignored in this build */
     431                 : }
     432                 : 
     433                 : #endif                          /* (HAVE_LANGINFO_H && CODESET) || WIN32 */
        

Generated by: LCOV version v1.16-55-g56c0a2a