LCOV - differential code coverage report
Current view: top level - src/common - wchar.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 51.8 % 866 449 4 2 411 1 34 1 413 5 31 1
Current Date: 2023-04-08 15:15:32 Functions: 61.7 % 81 50 31 7 2 41 7
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * wchar.c
       4                 :  *    Functions for working with multibyte characters in various encodings.
       5                 :  *
       6                 :  * Portions Copyright (c) 1998-2023, PostgreSQL Global Development Group
       7                 :  *
       8                 :  * IDENTIFICATION
       9                 :  *    src/common/wchar.c
      10                 :  *
      11                 :  *-------------------------------------------------------------------------
      12                 :  */
      13                 : #include "c.h"
      14                 : 
      15                 : #include "mb/pg_wchar.h"
      16                 : 
      17                 : 
      18                 : /*
      19                 :  * Operations on multi-byte encodings are driven by a table of helper
      20                 :  * functions.
      21                 :  *
      22                 :  * To add an encoding support, define mblen(), dsplen(), verifychar() and
      23                 :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
      24                 :  * and wchar2mb() conversion functions.
      25                 :  *
      26                 :  * These functions generally assume that their input is validly formed.
      27                 :  * The "verifier" functions, further down in the file, have to be more
      28                 :  * paranoid.
      29                 :  *
      30                 :  * We expect that mblen() does not need to examine more than the first byte
      31                 :  * of the character to discover the correct length.  GB18030 is an exception
      32                 :  * to that rule, though, as it also looks at second byte.  But even that
      33                 :  * behaves in a predictable way, if you only pass the first byte: it will
      34                 :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
      35                 :  * good enough for all current uses.
      36                 :  *
      37                 :  * Note: for the display output of psql to work properly, the return values
      38                 :  * of the dsplen functions must conform to the Unicode standard. In particular
      39                 :  * the NUL character is zero width and control characters are generally
      40                 :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      41                 :  * subset to the ASCII routines to ensure consistency.
      42                 :  */
      43                 : 
      44                 : /*
      45                 :  * SQL/ASCII
      46                 :  */
      47                 : static int
      48 CBC         246 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      49                 : {
      50             246 :     int         cnt = 0;
      51                 : 
      52            4657 :     while (len > 0 && *from)
      53                 :     {
      54            4411 :         *to++ = *from++;
      55            4411 :         len--;
      56            4411 :         cnt++;
      57                 :     }
      58             246 :     *to = 0;
      59             246 :     return cnt;
      60                 : }
      61                 : 
      62                 : static int
      63             837 : pg_ascii_mblen(const unsigned char *s)
      64                 : {
      65             837 :     return 1;
      66                 : }
      67                 : 
      68                 : static int
      69             400 : pg_ascii_dsplen(const unsigned char *s)
      70                 : {
      71             400 :     if (*s == '\0')
      72 UBC           0 :         return 0;
      73 CBC         400 :     if (*s < 0x20 || *s == 0x7f)
      74 UBC           0 :         return -1;
      75                 : 
      76 CBC         400 :     return 1;
      77                 : }
      78                 : 
      79                 : /*
      80                 :  * EUC
      81                 :  */
      82                 : static int
      83 UBC           0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      84                 : {
      85               0 :     int         cnt = 0;
      86                 : 
      87               0 :     while (len > 0 && *from)
      88                 :     {
      89               0 :         if (*from == SS2 && len >= 2)    /* JIS X 0201 (so called "1 byte
      90                 :                                          * KANA") */
      91                 :         {
      92               0 :             from++;
      93               0 :             *to = (SS2 << 8) | *from++;
      94               0 :             len -= 2;
      95                 :         }
      96               0 :         else if (*from == SS3 && len >= 3)   /* JIS X 0212 KANJI */
      97                 :         {
      98               0 :             from++;
      99               0 :             *to = (SS3 << 16) | (*from++ << 8);
     100               0 :             *to |= *from++;
     101               0 :             len -= 3;
     102                 :         }
     103               0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
     104                 :         {
     105               0 :             *to = *from++ << 8;
     106               0 :             *to |= *from++;
     107               0 :             len -= 2;
     108                 :         }
     109                 :         else                    /* must be ASCII */
     110                 :         {
     111               0 :             *to = *from++;
     112               0 :             len--;
     113                 :         }
     114               0 :         to++;
     115               0 :         cnt++;
     116                 :     }
     117               0 :     *to = 0;
     118               0 :     return cnt;
     119                 : }
     120                 : 
     121                 : static inline int
     122 CBC          90 : pg_euc_mblen(const unsigned char *s)
     123                 : {
     124                 :     int         len;
     125                 : 
     126              90 :     if (*s == SS2)
     127 UBC           0 :         len = 2;
     128 CBC          90 :     else if (*s == SS3)
     129 UBC           0 :         len = 3;
     130 CBC          90 :     else if (IS_HIGHBIT_SET(*s))
     131              54 :         len = 2;
     132                 :     else
     133              36 :         len = 1;
     134              90 :     return len;
     135                 : }
     136                 : 
     137                 : static inline int
     138 UBC           0 : pg_euc_dsplen(const unsigned char *s)
     139                 : {
     140                 :     int         len;
     141                 : 
     142               0 :     if (*s == SS2)
     143               0 :         len = 2;
     144               0 :     else if (*s == SS3)
     145               0 :         len = 2;
     146               0 :     else if (IS_HIGHBIT_SET(*s))
     147               0 :         len = 2;
     148                 :     else
     149               0 :         len = pg_ascii_dsplen(s);
     150               0 :     return len;
     151                 : }
     152                 : 
     153                 : /*
     154                 :  * EUC_JP
     155                 :  */
     156                 : static int
     157               0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     158                 : {
     159               0 :     return pg_euc2wchar_with_len(from, to, len);
     160                 : }
     161                 : 
     162                 : static int
     163 CBC          90 : pg_eucjp_mblen(const unsigned char *s)
     164                 : {
     165              90 :     return pg_euc_mblen(s);
     166                 : }
     167                 : 
     168                 : static int
     169 UBC           0 : pg_eucjp_dsplen(const unsigned char *s)
     170                 : {
     171                 :     int         len;
     172                 : 
     173               0 :     if (*s == SS2)
     174               0 :         len = 1;
     175               0 :     else if (*s == SS3)
     176               0 :         len = 2;
     177               0 :     else if (IS_HIGHBIT_SET(*s))
     178               0 :         len = 2;
     179                 :     else
     180               0 :         len = pg_ascii_dsplen(s);
     181               0 :     return len;
     182                 : }
     183                 : 
     184                 : /*
     185                 :  * EUC_KR
     186                 :  */
     187                 : static int
     188               0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     189                 : {
     190               0 :     return pg_euc2wchar_with_len(from, to, len);
     191                 : }
     192                 : 
     193                 : static int
     194               0 : pg_euckr_mblen(const unsigned char *s)
     195                 : {
     196               0 :     return pg_euc_mblen(s);
     197                 : }
     198                 : 
     199                 : static int
     200               0 : pg_euckr_dsplen(const unsigned char *s)
     201                 : {
     202               0 :     return pg_euc_dsplen(s);
     203                 : }
     204                 : 
     205                 : /*
     206                 :  * EUC_CN
     207                 :  *
     208                 :  */
     209                 : static int
     210               0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     211                 : {
     212               0 :     int         cnt = 0;
     213                 : 
     214               0 :     while (len > 0 && *from)
     215                 :     {
     216               0 :         if (*from == SS2 && len >= 3)    /* code set 2 (unused?) */
     217                 :         {
     218               0 :             from++;
     219               0 :             *to = (SS2 << 16) | (*from++ << 8);
     220               0 :             *to |= *from++;
     221               0 :             len -= 3;
     222                 :         }
     223               0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused ?) */
     224                 :         {
     225               0 :             from++;
     226               0 :             *to = (SS3 << 16) | (*from++ << 8);
     227               0 :             *to |= *from++;
     228               0 :             len -= 3;
     229                 :         }
     230               0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
     231                 :         {
     232               0 :             *to = *from++ << 8;
     233               0 :             *to |= *from++;
     234               0 :             len -= 2;
     235                 :         }
     236                 :         else
     237                 :         {
     238               0 :             *to = *from++;
     239               0 :             len--;
     240                 :         }
     241               0 :         to++;
     242               0 :         cnt++;
     243                 :     }
     244               0 :     *to = 0;
     245               0 :     return cnt;
     246                 : }
     247                 : 
     248                 : static int
     249               0 : pg_euccn_mblen(const unsigned char *s)
     250                 : {
     251                 :     int         len;
     252                 : 
     253               0 :     if (IS_HIGHBIT_SET(*s))
     254               0 :         len = 2;
     255                 :     else
     256               0 :         len = 1;
     257               0 :     return len;
     258                 : }
     259                 : 
     260                 : static int
     261               0 : pg_euccn_dsplen(const unsigned char *s)
     262                 : {
     263                 :     int         len;
     264                 : 
     265               0 :     if (IS_HIGHBIT_SET(*s))
     266               0 :         len = 2;
     267                 :     else
     268               0 :         len = pg_ascii_dsplen(s);
     269               0 :     return len;
     270                 : }
     271                 : 
     272                 : /*
     273                 :  * EUC_TW
     274                 :  *
     275                 :  */
     276                 : static int
     277               0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     278                 : {
     279               0 :     int         cnt = 0;
     280                 : 
     281               0 :     while (len > 0 && *from)
     282                 :     {
     283               0 :         if (*from == SS2 && len >= 4)    /* code set 2 */
     284                 :         {
     285               0 :             from++;
     286               0 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);
     287               0 :             *to |= *from++ << 8;
     288               0 :             *to |= *from++;
     289               0 :             len -= 4;
     290                 :         }
     291               0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused?) */
     292                 :         {
     293               0 :             from++;
     294               0 :             *to = (SS3 << 16) | (*from++ << 8);
     295               0 :             *to |= *from++;
     296               0 :             len -= 3;
     297                 :         }
     298               0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
     299                 :         {
     300               0 :             *to = *from++ << 8;
     301               0 :             *to |= *from++;
     302               0 :             len -= 2;
     303                 :         }
     304                 :         else
     305                 :         {
     306               0 :             *to = *from++;
     307               0 :             len--;
     308                 :         }
     309               0 :         to++;
     310               0 :         cnt++;
     311                 :     }
     312               0 :     *to = 0;
     313               0 :     return cnt;
     314                 : }
     315                 : 
     316                 : static int
     317               0 : pg_euctw_mblen(const unsigned char *s)
     318                 : {
     319                 :     int         len;
     320                 : 
     321               0 :     if (*s == SS2)
     322               0 :         len = 4;
     323               0 :     else if (*s == SS3)
     324               0 :         len = 3;
     325               0 :     else if (IS_HIGHBIT_SET(*s))
     326               0 :         len = 2;
     327                 :     else
     328               0 :         len = 1;
     329               0 :     return len;
     330                 : }
     331                 : 
     332                 : static int
     333               0 : pg_euctw_dsplen(const unsigned char *s)
     334                 : {
     335                 :     int         len;
     336                 : 
     337               0 :     if (*s == SS2)
     338               0 :         len = 2;
     339               0 :     else if (*s == SS3)
     340               0 :         len = 2;
     341               0 :     else if (IS_HIGHBIT_SET(*s))
     342               0 :         len = 2;
     343                 :     else
     344               0 :         len = pg_ascii_dsplen(s);
     345               0 :     return len;
     346                 : }
     347                 : 
     348                 : /*
     349                 :  * Convert pg_wchar to EUC_* encoding.
     350                 :  * caller must allocate enough space for "to", including a trailing zero!
     351                 :  * len: length of from.
     352                 :  * "from" not necessarily null terminated.
     353                 :  */
     354                 : static int
     355               0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     356                 : {
     357               0 :     int         cnt = 0;
     358                 : 
     359               0 :     while (len > 0 && *from)
     360                 :     {
     361                 :         unsigned char c;
     362                 : 
     363               0 :         if ((c = (*from >> 24)))
     364                 :         {
     365               0 :             *to++ = c;
     366               0 :             *to++ = (*from >> 16) & 0xff;
     367               0 :             *to++ = (*from >> 8) & 0xff;
     368               0 :             *to++ = *from & 0xff;
     369               0 :             cnt += 4;
     370                 :         }
     371               0 :         else if ((c = (*from >> 16)))
     372                 :         {
     373               0 :             *to++ = c;
     374               0 :             *to++ = (*from >> 8) & 0xff;
     375               0 :             *to++ = *from & 0xff;
     376               0 :             cnt += 3;
     377                 :         }
     378               0 :         else if ((c = (*from >> 8)))
     379                 :         {
     380               0 :             *to++ = c;
     381               0 :             *to++ = *from & 0xff;
     382               0 :             cnt += 2;
     383                 :         }
     384                 :         else
     385                 :         {
     386               0 :             *to++ = *from;
     387               0 :             cnt++;
     388                 :         }
     389               0 :         from++;
     390               0 :         len--;
     391                 :     }
     392               0 :     *to = 0;
     393               0 :     return cnt;
     394                 : }
     395                 : 
     396                 : 
     397                 : /*
     398                 :  * JOHAB
     399                 :  */
     400                 : static int
     401               0 : pg_johab_mblen(const unsigned char *s)
     402                 : {
     403               0 :     return pg_euc_mblen(s);
     404                 : }
     405                 : 
     406                 : static int
     407               0 : pg_johab_dsplen(const unsigned char *s)
     408                 : {
     409               0 :     return pg_euc_dsplen(s);
     410                 : }
     411                 : 
     412                 : /*
     413                 :  * convert UTF8 string to pg_wchar (UCS-4)
     414                 :  * caller must allocate enough space for "to", including a trailing zero!
     415                 :  * len: length of from.
     416                 :  * "from" not necessarily null terminated.
     417                 :  */
     418                 : static int
     419 CBC      467195 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     420                 : {
     421          467195 :     int         cnt = 0;
     422                 :     uint32      c1,
     423                 :                 c2,
     424                 :                 c3,
     425                 :                 c4;
     426                 : 
     427        24423147 :     while (len > 0 && *from)
     428                 :     {
     429        23955952 :         if ((*from & 0x80) == 0)
     430                 :         {
     431        23955634 :             *to = *from++;
     432        23955634 :             len--;
     433                 :         }
     434             318 :         else if ((*from & 0xe0) == 0xc0)
     435                 :         {
     436             278 :             if (len < 2)
     437 UBC           0 :                 break;          /* drop trailing incomplete char */
     438 CBC         278 :             c1 = *from++ & 0x1f;
     439             278 :             c2 = *from++ & 0x3f;
     440             278 :             *to = (c1 << 6) | c2;
     441             278 :             len -= 2;
     442                 :         }
     443              40 :         else if ((*from & 0xf0) == 0xe0)
     444                 :         {
     445              40 :             if (len < 3)
     446 UBC           0 :                 break;          /* drop trailing incomplete char */
     447 CBC          40 :             c1 = *from++ & 0x0f;
     448              40 :             c2 = *from++ & 0x3f;
     449              40 :             c3 = *from++ & 0x3f;
     450              40 :             *to = (c1 << 12) | (c2 << 6) | c3;
     451              40 :             len -= 3;
     452                 :         }
     453 UBC           0 :         else if ((*from & 0xf8) == 0xf0)
     454                 :         {
     455               0 :             if (len < 4)
     456               0 :                 break;          /* drop trailing incomplete char */
     457               0 :             c1 = *from++ & 0x07;
     458               0 :             c2 = *from++ & 0x3f;
     459               0 :             c3 = *from++ & 0x3f;
     460               0 :             c4 = *from++ & 0x3f;
     461               0 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     462               0 :             len -= 4;
     463                 :         }
     464                 :         else
     465                 :         {
     466                 :             /* treat a bogus char as length 1; not ours to raise error */
     467               0 :             *to = *from++;
     468               0 :             len--;
     469                 :         }
     470 CBC    23955952 :         to++;
     471        23955952 :         cnt++;
     472                 :     }
     473          467195 :     *to = 0;
     474          467195 :     return cnt;
     475                 : }
     476                 : 
     477                 : 
     478                 : /*
     479                 :  * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
     480                 :  * space allocated.
     481                 :  */
     482                 : unsigned char *
     483         7818723 : unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
     484                 : {
     485         7818723 :     if (c <= 0x7F)
     486                 :     {
     487         7818439 :         utf8string[0] = c;
     488                 :     }
     489             284 :     else if (c <= 0x7FF)
     490                 :     {
     491             132 :         utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
     492             132 :         utf8string[1] = 0x80 | (c & 0x3F);
     493                 :     }
     494             152 :     else if (c <= 0xFFFF)
     495                 :     {
     496             122 :         utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
     497             122 :         utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
     498             122 :         utf8string[2] = 0x80 | (c & 0x3F);
     499                 :     }
     500                 :     else
     501                 :     {
     502              30 :         utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
     503              30 :         utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
     504              30 :         utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
     505              30 :         utf8string[3] = 0x80 | (c & 0x3F);
     506                 :     }
     507                 : 
     508         7818723 :     return utf8string;
     509                 : }
     510                 : 
     511                 : /*
     512                 :  * Trivial conversion from pg_wchar to UTF-8.
     513                 :  * caller should allocate enough space for "to"
     514                 :  * len: length of from.
     515                 :  * "from" not necessarily null terminated.
     516                 :  */
     517                 : static int
     518          555690 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     519                 : {
     520          555690 :     int         cnt = 0;
     521                 : 
     522         8374066 :     while (len > 0 && *from)
     523                 :     {
     524                 :         int         char_len;
     525                 : 
     526         7818376 :         unicode_to_utf8(*from, to);
     527         7818376 :         char_len = pg_utf_mblen(to);
     528         7818376 :         cnt += char_len;
     529         7818376 :         to += char_len;
     530         7818376 :         from++;
     531         7818376 :         len--;
     532                 :     }
     533          555690 :     *to = 0;
     534          555690 :     return cnt;
     535                 : }
     536                 : 
     537                 : /*
     538                 :  * Return the byte length of a UTF8 character pointed to by s
     539                 :  *
     540                 :  * Note: in the current implementation we do not support UTF8 sequences
     541                 :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     542                 :  * We return "1" for any leading byte that is either flat-out illegal or
     543                 :  * indicates a length larger than we support.
     544                 :  *
     545                 :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     546                 :  * other places would need to be fixed to change this.
     547                 :  */
     548                 : int
     549       144827375 : pg_utf_mblen(const unsigned char *s)
     550                 : {
     551                 :     int         len;
     552                 : 
     553       144827375 :     if ((*s & 0x80) == 0)
     554       144814332 :         len = 1;
     555           13043 :     else if ((*s & 0xe0) == 0xc0)
     556            6366 :         len = 2;
     557            6677 :     else if ((*s & 0xf0) == 0xe0)
     558            6227 :         len = 3;
     559             450 :     else if ((*s & 0xf8) == 0xf0)
     560             364 :         len = 4;
     561                 : #ifdef NOT_USED
     562                 :     else if ((*s & 0xfc) == 0xf8)
     563                 :         len = 5;
     564                 :     else if ((*s & 0xfe) == 0xfc)
     565                 :         len = 6;
     566                 : #endif
     567                 :     else
     568              86 :         len = 1;
     569       144827375 :     return len;
     570                 : }
     571                 : 
     572                 : /*
     573                 :  * This is an implementation of wcwidth() and wcswidth() as defined in
     574                 :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     575                 :  * <http://www.unix.org/online.html>
     576                 :  *
     577                 :  * Markus Kuhn -- 2001-09-08 -- public domain
     578                 :  *
     579                 :  * customised for PostgreSQL
     580                 :  *
     581                 :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     582                 :  */
     583                 : 
     584                 : struct mbinterval
     585                 : {
     586                 :     unsigned int first;
     587                 :     unsigned int last;
     588                 : };
     589                 : 
     590                 : /* auxiliary function for binary search in interval table */
     591                 : static int
     592        48447406 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
     593                 : {
     594        48447406 :     int         min = 0;
     595                 :     int         mid;
     596                 : 
     597        48447406 :     if (ucs < table[0].first || ucs > table[max].last)
     598        48444614 :         return 0;
     599           24732 :     while (max >= min)
     600                 :     {
     601           22060 :         mid = (min + max) / 2;
     602           22060 :         if (ucs > table[mid].last)
     603            3644 :             min = mid + 1;
     604           18416 :         else if (ucs < table[mid].first)
     605           18296 :             max = mid - 1;
     606                 :         else
     607             120 :             return 1;
     608                 :     }
     609                 : 
     610            2672 :     return 0;
     611                 : }
     612                 : 
     613                 : 
     614                 : /* The following functions define the column width of an ISO 10646
     615                 :  * character as follows:
     616                 :  *
     617                 :  *    - The null character (U+0000) has a column width of 0.
     618                 :  *
     619                 :  *    - Other C0/C1 control characters and DEL will lead to a return
     620                 :  *      value of -1.
     621                 :  *
     622                 :  *    - Non-spacing and enclosing combining characters (general
     623                 :  *      category code Mn, Me or Cf in the Unicode database) have a
     624                 :  *      column width of 0.
     625                 :  *
     626                 :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     627                 :  *      FullWidth (F) category as defined in Unicode Technical
     628                 :  *      Report #11 have a column width of 2.
     629                 :  *
     630                 :  *    - All remaining characters (including all printable
     631                 :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     632                 :  *      etc.) have a column width of 1.
     633                 :  *
     634                 :  * This implementation assumes that wchar_t characters are encoded
     635                 :  * in ISO 10646.
     636                 :  */
     637                 : 
     638                 : static int
     639        24252996 : ucs_wcwidth(pg_wchar ucs)
     640                 : {
     641                 : #include "common/unicode_nonspacing_table.h"
     642                 : #include "common/unicode_east_asian_fw_table.h"
     643                 : 
     644                 :     /* test for 8-bit control characters */
     645        24252996 :     if (ucs == 0)
     646 UBC           0 :         return 0;
     647                 : 
     648 CBC    24252996 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
     649           29257 :         return -1;
     650                 : 
     651                 :     /*
     652                 :      * binary search in table of non-spacing characters
     653                 :      *
     654                 :      * XXX: In the official Unicode sources, it is possible for a character to
     655                 :      * be described as both non-spacing and wide at the same time. As of
     656                 :      * Unicode 13.0, treating the non-spacing property as the determining
     657                 :      * factor for display width leads to the correct behavior, so do that
     658                 :      * search first.
     659                 :      */
     660 GNC    24223739 :     if (mbbisearch(ucs, nonspacing,
     661                 :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
     662 CBC          72 :         return 0;
     663                 : 
     664                 :     /* binary search in table of wide characters */
     665        24223667 :     if (mbbisearch(ucs, east_asian_fw,
     666                 :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
     667              48 :         return 2;
     668                 : 
     669        24223619 :     return 1;
     670                 : }
     671                 : 
     672                 : /*
     673                 :  * Convert a UTF-8 character to a Unicode code point.
     674                 :  * This is a one-character version of pg_utf2wchar_with_len.
     675                 :  *
     676                 :  * No error checks here, c must point to a long-enough string.
     677                 :  */
     678                 : pg_wchar
     679        24253729 : utf8_to_unicode(const unsigned char *c)
     680                 : {
     681        24253729 :     if ((*c & 0x80) == 0)
     682        24250873 :         return (pg_wchar) c[0];
     683            2856 :     else if ((*c & 0xe0) == 0xc0)
     684            2654 :         return (pg_wchar) (((c[0] & 0x1f) << 6) |
     685            2654 :                            (c[1] & 0x3f));
     686             202 :     else if ((*c & 0xf0) == 0xe0)
     687             154 :         return (pg_wchar) (((c[0] & 0x0f) << 12) |
     688             154 :                            ((c[1] & 0x3f) << 6) |
     689             154 :                            (c[2] & 0x3f));
     690              48 :     else if ((*c & 0xf8) == 0xf0)
     691              48 :         return (pg_wchar) (((c[0] & 0x07) << 18) |
     692              48 :                            ((c[1] & 0x3f) << 12) |
     693              48 :                            ((c[2] & 0x3f) << 6) |
     694              48 :                            (c[3] & 0x3f));
     695                 :     else
     696                 :         /* that is an invalid code on purpose */
     697 UBC           0 :         return 0xffffffff;
     698                 : }
     699                 : 
     700                 : static int
     701 CBC    24252996 : pg_utf_dsplen(const unsigned char *s)
     702                 : {
     703        24252996 :     return ucs_wcwidth(utf8_to_unicode(s));
     704                 : }
     705                 : 
     706                 : /*
     707                 :  * convert mule internal code to pg_wchar
     708                 :  * caller should allocate enough space for "to"
     709                 :  * len: length of from.
     710                 :  * "from" not necessarily null terminated.
     711                 :  */
     712                 : static int
     713 UBC           0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     714                 : {
     715               0 :     int         cnt = 0;
     716                 : 
     717               0 :     while (len > 0 && *from)
     718                 :     {
     719               0 :         if (IS_LC1(*from) && len >= 2)
     720                 :         {
     721               0 :             *to = *from++ << 16;
     722               0 :             *to |= *from++;
     723               0 :             len -= 2;
     724                 :         }
     725               0 :         else if (IS_LCPRV1(*from) && len >= 3)
     726                 :         {
     727               0 :             from++;
     728               0 :             *to = *from++ << 16;
     729               0 :             *to |= *from++;
     730               0 :             len -= 3;
     731                 :         }
     732               0 :         else if (IS_LC2(*from) && len >= 3)
     733                 :         {
     734               0 :             *to = *from++ << 16;
     735               0 :             *to |= *from++ << 8;
     736               0 :             *to |= *from++;
     737               0 :             len -= 3;
     738                 :         }
     739               0 :         else if (IS_LCPRV2(*from) && len >= 4)
     740                 :         {
     741               0 :             from++;
     742               0 :             *to = *from++ << 16;
     743               0 :             *to |= *from++ << 8;
     744               0 :             *to |= *from++;
     745               0 :             len -= 4;
     746                 :         }
     747                 :         else
     748                 :         {                       /* assume ASCII */
     749               0 :             *to = (unsigned char) *from++;
     750               0 :             len--;
     751                 :         }
     752               0 :         to++;
     753               0 :         cnt++;
     754                 :     }
     755               0 :     *to = 0;
     756               0 :     return cnt;
     757                 : }
     758                 : 
     759                 : /*
     760                 :  * convert pg_wchar to mule internal code
     761                 :  * caller should allocate enough space for "to"
     762                 :  * len: length of from.
     763                 :  * "from" not necessarily null terminated.
     764                 :  */
     765                 : static int
     766               0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     767                 : {
     768               0 :     int         cnt = 0;
     769                 : 
     770               0 :     while (len > 0 && *from)
     771                 :     {
     772                 :         unsigned char lb;
     773                 : 
     774               0 :         lb = (*from >> 16) & 0xff;
     775               0 :         if (IS_LC1(lb))
     776                 :         {
     777               0 :             *to++ = lb;
     778               0 :             *to++ = *from & 0xff;
     779               0 :             cnt += 2;
     780                 :         }
     781               0 :         else if (IS_LC2(lb))
     782                 :         {
     783               0 :             *to++ = lb;
     784               0 :             *to++ = (*from >> 8) & 0xff;
     785               0 :             *to++ = *from & 0xff;
     786               0 :             cnt += 3;
     787                 :         }
     788               0 :         else if (IS_LCPRV1_A_RANGE(lb))
     789                 :         {
     790               0 :             *to++ = LCPRV1_A;
     791               0 :             *to++ = lb;
     792               0 :             *to++ = *from & 0xff;
     793               0 :             cnt += 3;
     794                 :         }
     795               0 :         else if (IS_LCPRV1_B_RANGE(lb))
     796                 :         {
     797               0 :             *to++ = LCPRV1_B;
     798               0 :             *to++ = lb;
     799               0 :             *to++ = *from & 0xff;
     800               0 :             cnt += 3;
     801                 :         }
     802               0 :         else if (IS_LCPRV2_A_RANGE(lb))
     803                 :         {
     804               0 :             *to++ = LCPRV2_A;
     805               0 :             *to++ = lb;
     806               0 :             *to++ = (*from >> 8) & 0xff;
     807               0 :             *to++ = *from & 0xff;
     808               0 :             cnt += 4;
     809                 :         }
     810               0 :         else if (IS_LCPRV2_B_RANGE(lb))
     811                 :         {
     812               0 :             *to++ = LCPRV2_B;
     813               0 :             *to++ = lb;
     814               0 :             *to++ = (*from >> 8) & 0xff;
     815               0 :             *to++ = *from & 0xff;
     816               0 :             cnt += 4;
     817                 :         }
     818                 :         else
     819                 :         {
     820               0 :             *to++ = *from & 0xff;
     821               0 :             cnt += 1;
     822                 :         }
     823               0 :         from++;
     824               0 :         len--;
     825                 :     }
     826               0 :     *to = 0;
     827               0 :     return cnt;
     828                 : }
     829                 : 
     830                 : /* exported for direct use by conv.c */
     831                 : int
     832 CBC        1476 : pg_mule_mblen(const unsigned char *s)
     833                 : {
     834                 :     int         len;
     835                 : 
     836            1476 :     if (IS_LC1(*s))
     837             594 :         len = 2;
     838             882 :     else if (IS_LCPRV1(*s))
     839 UBC           0 :         len = 3;
     840 CBC         882 :     else if (IS_LC2(*s))
     841             855 :         len = 3;
     842              27 :     else if (IS_LCPRV2(*s))
     843 UBC           0 :         len = 4;
     844                 :     else
     845 CBC          27 :         len = 1;                /* assume ASCII */
     846            1476 :     return len;
     847                 : }
     848                 : 
     849                 : static int
     850 UBC           0 : pg_mule_dsplen(const unsigned char *s)
     851                 : {
     852                 :     int         len;
     853                 : 
     854                 :     /*
     855                 :      * Note: it's not really appropriate to assume that all multibyte charsets
     856                 :      * are double-wide on screen.  But this seems an okay approximation for
     857                 :      * the MULE charsets we currently support.
     858                 :      */
     859                 : 
     860               0 :     if (IS_LC1(*s))
     861               0 :         len = 1;
     862               0 :     else if (IS_LCPRV1(*s))
     863               0 :         len = 1;
     864               0 :     else if (IS_LC2(*s))
     865               0 :         len = 2;
     866               0 :     else if (IS_LCPRV2(*s))
     867               0 :         len = 2;
     868                 :     else
     869               0 :         len = 1;                /* assume ASCII */
     870                 : 
     871               0 :     return len;
     872                 : }
     873                 : 
     874                 : /*
     875                 :  * ISO8859-1
     876                 :  */
     877                 : static int
     878 CBC         507 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     879                 : {
     880             507 :     int         cnt = 0;
     881                 : 
     882           14193 :     while (len > 0 && *from)
     883                 :     {
     884           13686 :         *to++ = *from++;
     885           13686 :         len--;
     886           13686 :         cnt++;
     887                 :     }
     888             507 :     *to = 0;
     889             507 :     return cnt;
     890                 : }
     891                 : 
     892                 : /*
     893                 :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     894                 :  * high bits.
     895                 :  * caller should allocate enough space for "to"
     896                 :  * len: length of from.
     897                 :  * "from" not necessarily null terminated.
     898                 :  */
     899                 : static int
     900              51 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     901                 : {
     902              51 :     int         cnt = 0;
     903                 : 
     904             396 :     while (len > 0 && *from)
     905                 :     {
     906             345 :         *to++ = *from++;
     907             345 :         len--;
     908             345 :         cnt++;
     909                 :     }
     910              51 :     *to = 0;
     911              51 :     return cnt;
     912                 : }
     913                 : 
     914                 : static int
     915            2158 : pg_latin1_mblen(const unsigned char *s)
     916                 : {
     917            2158 :     return 1;
     918                 : }
     919                 : 
     920                 : static int
     921             400 : pg_latin1_dsplen(const unsigned char *s)
     922                 : {
     923             400 :     return pg_ascii_dsplen(s);
     924                 : }
     925                 : 
     926                 : /*
     927                 :  * SJIS
     928                 :  */
     929                 : static int
     930             486 : pg_sjis_mblen(const unsigned char *s)
     931                 : {
     932                 :     int         len;
     933                 : 
     934             486 :     if (*s >= 0xa1 && *s <= 0xdf)
     935 UBC           0 :         len = 1;                /* 1 byte kana? */
     936 CBC         486 :     else if (IS_HIGHBIT_SET(*s))
     937             432 :         len = 2;                /* kanji? */
     938                 :     else
     939              54 :         len = 1;                /* should be ASCII */
     940             486 :     return len;
     941                 : }
     942                 : 
     943                 : static int
     944 UBC           0 : pg_sjis_dsplen(const unsigned char *s)
     945                 : {
     946                 :     int         len;
     947                 : 
     948               0 :     if (*s >= 0xa1 && *s <= 0xdf)
     949               0 :         len = 1;                /* 1 byte kana? */
     950               0 :     else if (IS_HIGHBIT_SET(*s))
     951               0 :         len = 2;                /* kanji? */
     952                 :     else
     953               0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     954               0 :     return len;
     955                 : }
     956                 : 
     957                 : /*
     958                 :  * Big5
     959                 :  */
     960                 : static int
     961 CBC         234 : pg_big5_mblen(const unsigned char *s)
     962                 : {
     963                 :     int         len;
     964                 : 
     965             234 :     if (IS_HIGHBIT_SET(*s))
     966             207 :         len = 2;                /* kanji? */
     967                 :     else
     968              27 :         len = 1;                /* should be ASCII */
     969             234 :     return len;
     970                 : }
     971                 : 
     972                 : static int
     973 UBC           0 : pg_big5_dsplen(const unsigned char *s)
     974                 : {
     975                 :     int         len;
     976                 : 
     977               0 :     if (IS_HIGHBIT_SET(*s))
     978               0 :         len = 2;                /* kanji? */
     979                 :     else
     980               0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     981               0 :     return len;
     982                 : }
     983                 : 
     984                 : /*
     985                 :  * GBK
     986                 :  */
     987                 : static int
     988               0 : pg_gbk_mblen(const unsigned char *s)
     989                 : {
     990                 :     int         len;
     991                 : 
     992               0 :     if (IS_HIGHBIT_SET(*s))
     993               0 :         len = 2;                /* kanji? */
     994                 :     else
     995               0 :         len = 1;                /* should be ASCII */
     996               0 :     return len;
     997                 : }
     998                 : 
     999                 : static int
    1000               0 : pg_gbk_dsplen(const unsigned char *s)
    1001                 : {
    1002                 :     int         len;
    1003                 : 
    1004               0 :     if (IS_HIGHBIT_SET(*s))
    1005               0 :         len = 2;                /* kanji? */
    1006                 :     else
    1007               0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1008               0 :     return len;
    1009                 : }
    1010                 : 
    1011                 : /*
    1012                 :  * UHC
    1013                 :  */
    1014                 : static int
    1015               0 : pg_uhc_mblen(const unsigned char *s)
    1016                 : {
    1017                 :     int         len;
    1018                 : 
    1019               0 :     if (IS_HIGHBIT_SET(*s))
    1020               0 :         len = 2;                /* 2byte? */
    1021                 :     else
    1022               0 :         len = 1;                /* should be ASCII */
    1023               0 :     return len;
    1024                 : }
    1025                 : 
    1026                 : static int
    1027               0 : pg_uhc_dsplen(const unsigned char *s)
    1028                 : {
    1029                 :     int         len;
    1030                 : 
    1031               0 :     if (IS_HIGHBIT_SET(*s))
    1032               0 :         len = 2;                /* 2byte? */
    1033                 :     else
    1034               0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1035               0 :     return len;
    1036                 : }
    1037                 : 
    1038                 : /*
    1039                 :  * GB18030
    1040                 :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
    1041                 :  */
    1042                 : 
    1043                 : /*
    1044                 :  * Unlike all other mblen() functions, this also looks at the second byte of
    1045                 :  * the input.  However, if you only pass the first byte of a multi-byte
    1046                 :  * string, and \0 as the second byte, this still works in a predictable way:
    1047                 :  * a 4-byte character will be reported as two 2-byte characters.  That's
    1048                 :  * enough for all current uses, as a client-only encoding.  It works that
    1049                 :  * way, because in any valid 4-byte GB18030-encoded character, the third and
    1050                 :  * fourth byte look like a 2-byte encoded character, when looked at
    1051                 :  * separately.
    1052                 :  */
    1053                 : static int
    1054 CBC          81 : pg_gb18030_mblen(const unsigned char *s)
    1055                 : {
    1056                 :     int         len;
    1057                 : 
    1058              81 :     if (!IS_HIGHBIT_SET(*s))
    1059              18 :         len = 1;                /* ASCII */
    1060              63 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1061              63 :         len = 4;
    1062                 :     else
    1063 UBC           0 :         len = 2;
    1064 CBC          81 :     return len;
    1065                 : }
    1066                 : 
    1067                 : static int
    1068 UBC           0 : pg_gb18030_dsplen(const unsigned char *s)
    1069                 : {
    1070                 :     int         len;
    1071                 : 
    1072               0 :     if (IS_HIGHBIT_SET(*s))
    1073               0 :         len = 2;
    1074                 :     else
    1075               0 :         len = pg_ascii_dsplen(s);   /* ASCII */
    1076               0 :     return len;
    1077                 : }
    1078                 : 
    1079                 : /*
    1080                 :  *-------------------------------------------------------------------
    1081                 :  * multibyte sequence validators
    1082                 :  *
    1083                 :  * The verifychar functions accept "s", a pointer to the first byte of a
    1084                 :  * string, and "len", the remaining length of the string.  If there is a
    1085                 :  * validly encoded character beginning at *s, return its length in bytes;
    1086                 :  * else return -1.
    1087                 :  *
    1088                 :  * The verifystr functions also accept "s", a pointer to a string and "len",
    1089                 :  * the length of the string.  They verify the whole string, and return the
    1090                 :  * number of input bytes (<= len) that are valid.  In other words, if the
    1091                 :  * whole string is valid, verifystr returns "len", otherwise it returns the
    1092                 :  * byte offset of the first invalid character.  The verifystr functions must
    1093                 :  * test for and reject zeroes in the input.
    1094                 :  *
    1095                 :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
    1096                 :  * they must test for and reject zeroes in any additional bytes of a
    1097                 :  * multibyte character.  Note that this definition allows the function for a
    1098                 :  * single-byte encoding to be just "return 1".
    1099                 :  *-------------------------------------------------------------------
    1100                 :  */
    1101                 : static int
    1102 CBC          28 : pg_ascii_verifychar(const unsigned char *s, int len)
    1103                 : {
    1104              28 :     return 1;
    1105                 : }
    1106                 : 
    1107                 : static int
    1108            2281 : pg_ascii_verifystr(const unsigned char *s, int len)
    1109                 : {
    1110            2281 :     const unsigned char *nullpos = memchr(s, 0, len);
    1111                 : 
    1112            2281 :     if (nullpos == NULL)
    1113            2281 :         return len;
    1114                 :     else
    1115 UBC           0 :         return nullpos - s;
    1116                 : }
    1117                 : 
    1118                 : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
    1119                 : 
    1120                 : static int
    1121 CBC         216 : pg_eucjp_verifychar(const unsigned char *s, int len)
    1122                 : {
    1123                 :     int         l;
    1124                 :     unsigned char c1,
    1125                 :                 c2;
    1126                 : 
    1127             216 :     c1 = *s++;
    1128                 : 
    1129             216 :     switch (c1)
    1130                 :     {
    1131 UBC           0 :         case SS2:               /* JIS X 0201 */
    1132               0 :             l = 2;
    1133               0 :             if (l > len)
    1134               0 :                 return -1;
    1135               0 :             c2 = *s++;
    1136               0 :             if (c2 < 0xa1 || c2 > 0xdf)
    1137               0 :                 return -1;
    1138               0 :             break;
    1139                 : 
    1140               0 :         case SS3:               /* JIS X 0212 */
    1141               0 :             l = 3;
    1142               0 :             if (l > len)
    1143               0 :                 return -1;
    1144               0 :             c2 = *s++;
    1145               0 :             if (!IS_EUC_RANGE_VALID(c2))
    1146               0 :                 return -1;
    1147               0 :             c2 = *s++;
    1148               0 :             if (!IS_EUC_RANGE_VALID(c2))
    1149               0 :                 return -1;
    1150               0 :             break;
    1151                 : 
    1152 CBC         216 :         default:
    1153             216 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1154                 :             {
    1155             216 :                 l = 2;
    1156             216 :                 if (l > len)
    1157              36 :                     return -1;
    1158             180 :                 if (!IS_EUC_RANGE_VALID(c1))
    1159 UBC           0 :                     return -1;
    1160 CBC         180 :                 c2 = *s++;
    1161             180 :                 if (!IS_EUC_RANGE_VALID(c2))
    1162              72 :                     return -1;
    1163                 :             }
    1164                 :             else
    1165                 :                 /* must be ASCII */
    1166                 :             {
    1167 UBC           0 :                 l = 1;
    1168                 :             }
    1169 CBC         108 :             break;
    1170                 :     }
    1171                 : 
    1172             108 :     return l;
    1173                 : }
    1174                 : 
    1175                 : static int
    1176             132 : pg_eucjp_verifystr(const unsigned char *s, int len)
    1177                 : {
    1178             132 :     const unsigned char *start = s;
    1179                 : 
    1180             447 :     while (len > 0)
    1181                 :     {
    1182                 :         int         l;
    1183                 : 
    1184                 :         /* fast path for ASCII-subset characters */
    1185             405 :         if (!IS_HIGHBIT_SET(*s))
    1186                 :         {
    1187             297 :             if (*s == '\0')
    1188              36 :                 break;
    1189             261 :             l = 1;
    1190                 :         }
    1191                 :         else
    1192                 :         {
    1193             108 :             l = pg_eucjp_verifychar(s, len);
    1194             108 :             if (l == -1)
    1195              54 :                 break;
    1196                 :         }
    1197             315 :         s += l;
    1198             315 :         len -= l;
    1199                 :     }
    1200                 : 
    1201             132 :     return s - start;
    1202                 : }
    1203                 : 
    1204                 : static int
    1205 UBC           0 : pg_euckr_verifychar(const unsigned char *s, int len)
    1206                 : {
    1207                 :     int         l;
    1208                 :     unsigned char c1,
    1209                 :                 c2;
    1210                 : 
    1211               0 :     c1 = *s++;
    1212                 : 
    1213               0 :     if (IS_HIGHBIT_SET(c1))
    1214                 :     {
    1215               0 :         l = 2;
    1216               0 :         if (l > len)
    1217               0 :             return -1;
    1218               0 :         if (!IS_EUC_RANGE_VALID(c1))
    1219               0 :             return -1;
    1220               0 :         c2 = *s++;
    1221               0 :         if (!IS_EUC_RANGE_VALID(c2))
    1222               0 :             return -1;
    1223                 :     }
    1224                 :     else
    1225                 :         /* must be ASCII */
    1226                 :     {
    1227               0 :         l = 1;
    1228                 :     }
    1229                 : 
    1230               0 :     return l;
    1231                 : }
    1232                 : 
    1233                 : static int
    1234 CBC          12 : pg_euckr_verifystr(const unsigned char *s, int len)
    1235                 : {
    1236              12 :     const unsigned char *start = s;
    1237                 : 
    1238              48 :     while (len > 0)
    1239                 :     {
    1240                 :         int         l;
    1241                 : 
    1242                 :         /* fast path for ASCII-subset characters */
    1243              36 :         if (!IS_HIGHBIT_SET(*s))
    1244                 :         {
    1245              36 :             if (*s == '\0')
    1246 UBC           0 :                 break;
    1247 CBC          36 :             l = 1;
    1248                 :         }
    1249                 :         else
    1250                 :         {
    1251 UBC           0 :             l = pg_euckr_verifychar(s, len);
    1252               0 :             if (l == -1)
    1253               0 :                 break;
    1254                 :         }
    1255 CBC          36 :         s += l;
    1256              36 :         len -= l;
    1257                 :     }
    1258                 : 
    1259              12 :     return s - start;
    1260                 : }
    1261                 : 
    1262                 : /* EUC-CN byte sequences are exactly same as EUC-KR */
    1263                 : #define pg_euccn_verifychar pg_euckr_verifychar
    1264                 : #define pg_euccn_verifystr  pg_euckr_verifystr
    1265                 : 
    1266                 : static int
    1267 UBC           0 : pg_euctw_verifychar(const unsigned char *s, int len)
    1268                 : {
    1269                 :     int         l;
    1270                 :     unsigned char c1,
    1271                 :                 c2;
    1272                 : 
    1273               0 :     c1 = *s++;
    1274                 : 
    1275               0 :     switch (c1)
    1276                 :     {
    1277               0 :         case SS2:               /* CNS 11643 Plane 1-7 */
    1278               0 :             l = 4;
    1279               0 :             if (l > len)
    1280               0 :                 return -1;
    1281               0 :             c2 = *s++;
    1282               0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1283               0 :                 return -1;
    1284               0 :             c2 = *s++;
    1285               0 :             if (!IS_EUC_RANGE_VALID(c2))
    1286               0 :                 return -1;
    1287               0 :             c2 = *s++;
    1288               0 :             if (!IS_EUC_RANGE_VALID(c2))
    1289               0 :                 return -1;
    1290               0 :             break;
    1291                 : 
    1292               0 :         case SS3:               /* unused */
    1293               0 :             return -1;
    1294                 : 
    1295               0 :         default:
    1296               0 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1297                 :             {
    1298               0 :                 l = 2;
    1299               0 :                 if (l > len)
    1300               0 :                     return -1;
    1301                 :                 /* no further range check on c1? */
    1302               0 :                 c2 = *s++;
    1303               0 :                 if (!IS_EUC_RANGE_VALID(c2))
    1304               0 :                     return -1;
    1305                 :             }
    1306                 :             else
    1307                 :                 /* must be ASCII */
    1308                 :             {
    1309               0 :                 l = 1;
    1310                 :             }
    1311               0 :             break;
    1312                 :     }
    1313               0 :     return l;
    1314                 : }
    1315                 : 
    1316                 : static int
    1317 CBC           9 : pg_euctw_verifystr(const unsigned char *s, int len)
    1318                 : {
    1319               9 :     const unsigned char *start = s;
    1320                 : 
    1321              36 :     while (len > 0)
    1322                 :     {
    1323                 :         int         l;
    1324                 : 
    1325                 :         /* fast path for ASCII-subset characters */
    1326              27 :         if (!IS_HIGHBIT_SET(*s))
    1327                 :         {
    1328              27 :             if (*s == '\0')
    1329 UBC           0 :                 break;
    1330 CBC          27 :             l = 1;
    1331                 :         }
    1332                 :         else
    1333                 :         {
    1334 UBC           0 :             l = pg_euctw_verifychar(s, len);
    1335               0 :             if (l == -1)
    1336               0 :                 break;
    1337                 :         }
    1338 CBC          27 :         s += l;
    1339              27 :         len -= l;
    1340                 :     }
    1341                 : 
    1342               9 :     return s - start;
    1343                 : }
    1344                 : 
    1345                 : static int
    1346 UBC           0 : pg_johab_verifychar(const unsigned char *s, int len)
    1347                 : {
    1348                 :     int         l,
    1349                 :                 mbl;
    1350                 :     unsigned char c;
    1351                 : 
    1352               0 :     l = mbl = pg_johab_mblen(s);
    1353                 : 
    1354               0 :     if (len < l)
    1355               0 :         return -1;
    1356                 : 
    1357               0 :     if (!IS_HIGHBIT_SET(*s))
    1358               0 :         return mbl;
    1359                 : 
    1360               0 :     while (--l > 0)
    1361                 :     {
    1362               0 :         c = *++s;
    1363               0 :         if (!IS_EUC_RANGE_VALID(c))
    1364               0 :             return -1;
    1365                 :     }
    1366               0 :     return mbl;
    1367                 : }
    1368                 : 
    1369                 : static int
    1370 CBC           3 : pg_johab_verifystr(const unsigned char *s, int len)
    1371                 : {
    1372               3 :     const unsigned char *start = s;
    1373                 : 
    1374              12 :     while (len > 0)
    1375                 :     {
    1376                 :         int         l;
    1377                 : 
    1378                 :         /* fast path for ASCII-subset characters */
    1379               9 :         if (!IS_HIGHBIT_SET(*s))
    1380                 :         {
    1381               9 :             if (*s == '\0')
    1382 UBC           0 :                 break;
    1383 CBC           9 :             l = 1;
    1384                 :         }
    1385                 :         else
    1386                 :         {
    1387 UBC           0 :             l = pg_johab_verifychar(s, len);
    1388               0 :             if (l == -1)
    1389               0 :                 break;
    1390                 :         }
    1391 CBC           9 :         s += l;
    1392               9 :         len -= l;
    1393                 :     }
    1394                 : 
    1395               3 :     return s - start;
    1396                 : }
    1397                 : 
    1398                 : static int
    1399             648 : pg_mule_verifychar(const unsigned char *s, int len)
    1400                 : {
    1401                 :     int         l,
    1402                 :                 mbl;
    1403                 :     unsigned char c;
    1404                 : 
    1405             648 :     l = mbl = pg_mule_mblen(s);
    1406                 : 
    1407             648 :     if (len < l)
    1408             162 :         return -1;
    1409                 : 
    1410             999 :     while (--l > 0)
    1411                 :     {
    1412             657 :         c = *++s;
    1413             657 :         if (!IS_HIGHBIT_SET(c))
    1414             144 :             return -1;
    1415                 :     }
    1416             342 :     return mbl;
    1417                 : }
    1418                 : 
    1419                 : static int
    1420             189 : pg_mule_verifystr(const unsigned char *s, int len)
    1421                 : {
    1422             189 :     const unsigned char *start = s;
    1423                 : 
    1424             531 :     while (len > 0)
    1425                 :     {
    1426                 :         int         l;
    1427                 : 
    1428                 :         /* fast path for ASCII-subset characters */
    1429             450 :         if (!IS_HIGHBIT_SET(*s))
    1430                 :         {
    1431             261 :             if (*s == '\0')
    1432              18 :                 break;
    1433             243 :             l = 1;
    1434                 :         }
    1435                 :         else
    1436                 :         {
    1437             189 :             l = pg_mule_verifychar(s, len);
    1438             189 :             if (l == -1)
    1439              90 :                 break;
    1440                 :         }
    1441             342 :         s += l;
    1442             342 :         len -= l;
    1443                 :     }
    1444                 : 
    1445             189 :     return s - start;
    1446                 : }
    1447                 : 
    1448                 : static int
    1449             167 : pg_latin1_verifychar(const unsigned char *s, int len)
    1450                 : {
    1451             167 :     return 1;
    1452                 : }
    1453                 : 
    1454                 : static int
    1455            5416 : pg_latin1_verifystr(const unsigned char *s, int len)
    1456                 : {
    1457            5416 :     const unsigned char *nullpos = memchr(s, 0, len);
    1458                 : 
    1459            5416 :     if (nullpos == NULL)
    1460            5362 :         return len;
    1461                 :     else
    1462              54 :         return nullpos - s;
    1463                 : }
    1464                 : 
    1465                 : static int
    1466             351 : pg_sjis_verifychar(const unsigned char *s, int len)
    1467                 : {
    1468                 :     int         l,
    1469                 :                 mbl;
    1470                 :     unsigned char c1,
    1471                 :                 c2;
    1472                 : 
    1473             351 :     l = mbl = pg_sjis_mblen(s);
    1474                 : 
    1475             351 :     if (len < l)
    1476              54 :         return -1;
    1477                 : 
    1478             297 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1479 UBC           0 :         return mbl;
    1480                 : 
    1481 CBC         297 :     c1 = *s++;
    1482             297 :     c2 = *s;
    1483             297 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1484             108 :         return -1;
    1485             189 :     return mbl;
    1486                 : }
    1487                 : 
    1488                 : static int
    1489             141 : pg_sjis_verifystr(const unsigned char *s, int len)
    1490                 : {
    1491             141 :     const unsigned char *start = s;
    1492                 : 
    1493             627 :     while (len > 0)
    1494                 :     {
    1495                 :         int         l;
    1496                 : 
    1497                 :         /* fast path for ASCII-subset characters */
    1498             576 :         if (!IS_HIGHBIT_SET(*s))
    1499                 :         {
    1500             459 :             if (*s == '\0')
    1501              36 :                 break;
    1502             423 :             l = 1;
    1503                 :         }
    1504                 :         else
    1505                 :         {
    1506             117 :             l = pg_sjis_verifychar(s, len);
    1507             117 :             if (l == -1)
    1508              54 :                 break;
    1509                 :         }
    1510             486 :         s += l;
    1511             486 :         len -= l;
    1512                 :     }
    1513                 : 
    1514             141 :     return s - start;
    1515                 : }
    1516                 : 
    1517                 : static int
    1518             171 : pg_big5_verifychar(const unsigned char *s, int len)
    1519                 : {
    1520                 :     int         l,
    1521                 :                 mbl;
    1522                 : 
    1523             171 :     l = mbl = pg_big5_mblen(s);
    1524                 : 
    1525             171 :     if (len < l)
    1526 UBC           0 :         return -1;
    1527                 : 
    1528 CBC         288 :     while (--l > 0)
    1529                 :     {
    1530             171 :         if (*++s == '\0')
    1531              54 :             return -1;
    1532                 :     }
    1533                 : 
    1534             117 :     return mbl;
    1535                 : }
    1536                 : 
    1537                 : static int
    1538              72 : pg_big5_verifystr(const unsigned char *s, int len)
    1539                 : {
    1540              72 :     const unsigned char *start = s;
    1541                 : 
    1542             324 :     while (len > 0)
    1543                 :     {
    1544                 :         int         l;
    1545                 : 
    1546                 :         /* fast path for ASCII-subset characters */
    1547             288 :         if (!IS_HIGHBIT_SET(*s))
    1548                 :         {
    1549             234 :             if (*s == '\0')
    1550              18 :                 break;
    1551             216 :             l = 1;
    1552                 :         }
    1553                 :         else
    1554                 :         {
    1555              54 :             l = pg_big5_verifychar(s, len);
    1556              54 :             if (l == -1)
    1557              18 :                 break;
    1558                 :         }
    1559             252 :         s += l;
    1560             252 :         len -= l;
    1561                 :     }
    1562                 : 
    1563              72 :     return s - start;
    1564                 : }
    1565                 : 
    1566                 : static int
    1567 UBC           0 : pg_gbk_verifychar(const unsigned char *s, int len)
    1568                 : {
    1569                 :     int         l,
    1570                 :                 mbl;
    1571                 : 
    1572               0 :     l = mbl = pg_gbk_mblen(s);
    1573                 : 
    1574               0 :     if (len < l)
    1575               0 :         return -1;
    1576                 : 
    1577               0 :     while (--l > 0)
    1578                 :     {
    1579               0 :         if (*++s == '\0')
    1580               0 :             return -1;
    1581                 :     }
    1582                 : 
    1583               0 :     return mbl;
    1584                 : }
    1585                 : 
    1586                 : static int
    1587 CBC           3 : pg_gbk_verifystr(const unsigned char *s, int len)
    1588                 : {
    1589               3 :     const unsigned char *start = s;
    1590                 : 
    1591              12 :     while (len > 0)
    1592                 :     {
    1593                 :         int         l;
    1594                 : 
    1595                 :         /* fast path for ASCII-subset characters */
    1596               9 :         if (!IS_HIGHBIT_SET(*s))
    1597                 :         {
    1598               9 :             if (*s == '\0')
    1599 UBC           0 :                 break;
    1600 CBC           9 :             l = 1;
    1601                 :         }
    1602                 :         else
    1603                 :         {
    1604 UBC           0 :             l = pg_gbk_verifychar(s, len);
    1605               0 :             if (l == -1)
    1606               0 :                 break;
    1607                 :         }
    1608 CBC           9 :         s += l;
    1609               9 :         len -= l;
    1610                 :     }
    1611                 : 
    1612               3 :     return s - start;
    1613                 : }
    1614                 : 
    1615                 : static int
    1616 UBC           0 : pg_uhc_verifychar(const unsigned char *s, int len)
    1617                 : {
    1618                 :     int         l,
    1619                 :                 mbl;
    1620                 : 
    1621               0 :     l = mbl = pg_uhc_mblen(s);
    1622                 : 
    1623               0 :     if (len < l)
    1624               0 :         return -1;
    1625                 : 
    1626               0 :     while (--l > 0)
    1627                 :     {
    1628               0 :         if (*++s == '\0')
    1629               0 :             return -1;
    1630                 :     }
    1631                 : 
    1632               0 :     return mbl;
    1633                 : }
    1634                 : 
    1635                 : static int
    1636 CBC           3 : pg_uhc_verifystr(const unsigned char *s, int len)
    1637                 : {
    1638               3 :     const unsigned char *start = s;
    1639                 : 
    1640              12 :     while (len > 0)
    1641                 :     {
    1642                 :         int         l;
    1643                 : 
    1644                 :         /* fast path for ASCII-subset characters */
    1645               9 :         if (!IS_HIGHBIT_SET(*s))
    1646                 :         {
    1647               9 :             if (*s == '\0')
    1648 UBC           0 :                 break;
    1649 CBC           9 :             l = 1;
    1650                 :         }
    1651                 :         else
    1652                 :         {
    1653 UBC           0 :             l = pg_uhc_verifychar(s, len);
    1654               0 :             if (l == -1)
    1655               0 :                 break;
    1656                 :         }
    1657 CBC           9 :         s += l;
    1658               9 :         len -= l;
    1659                 :     }
    1660                 : 
    1661               3 :     return s - start;
    1662                 : }
    1663                 : 
    1664                 : static int
    1665             207 : pg_gb18030_verifychar(const unsigned char *s, int len)
    1666                 : {
    1667                 :     int         l;
    1668                 : 
    1669             207 :     if (!IS_HIGHBIT_SET(*s))
    1670 UBC           0 :         l = 1;                  /* ASCII */
    1671 CBC         207 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1672                 :     {
    1673                 :         /* Should be 4-byte, validate remaining bytes */
    1674             153 :         if (*s >= 0x81 && *s <= 0xfe &&
    1675             153 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1676             153 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1677              81 :             l = 4;
    1678                 :         else
    1679              72 :             l = -1;
    1680                 :     }
    1681              54 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1682                 :     {
    1683                 :         /* Should be 2-byte, validate */
    1684              54 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1685              54 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1686              18 :             l = 2;
    1687                 :         else
    1688              36 :             l = -1;
    1689                 :     }
    1690                 :     else
    1691 UBC           0 :         l = -1;
    1692 CBC         207 :     return l;
    1693                 : }
    1694                 : 
    1695                 : static int
    1696             111 : pg_gb18030_verifystr(const unsigned char *s, int len)
    1697                 : {
    1698             111 :     const unsigned char *start = s;
    1699                 : 
    1700             489 :     while (len > 0)
    1701                 :     {
    1702                 :         int         l;
    1703                 : 
    1704                 :         /* fast path for ASCII-subset characters */
    1705             450 :         if (!IS_HIGHBIT_SET(*s))
    1706                 :         {
    1707             351 :             if (*s == '\0')
    1708              18 :                 break;
    1709             333 :             l = 1;
    1710                 :         }
    1711                 :         else
    1712                 :         {
    1713              99 :             l = pg_gb18030_verifychar(s, len);
    1714              99 :             if (l == -1)
    1715              54 :                 break;
    1716                 :         }
    1717             378 :         s += l;
    1718             378 :         len -= l;
    1719                 :     }
    1720                 : 
    1721             111 :     return s - start;
    1722                 : }
    1723                 : 
    1724                 : static int
    1725            5178 : pg_utf8_verifychar(const unsigned char *s, int len)
    1726                 : {
    1727                 :     int         l;
    1728                 : 
    1729            5178 :     if ((*s & 0x80) == 0)
    1730                 :     {
    1731 UBC           0 :         if (*s == '\0')
    1732               0 :             return -1;
    1733               0 :         return 1;
    1734                 :     }
    1735 CBC        5178 :     else if ((*s & 0xe0) == 0xc0)
    1736            1918 :         l = 2;
    1737            3260 :     else if ((*s & 0xf0) == 0xe0)
    1738            2742 :         l = 3;
    1739             518 :     else if ((*s & 0xf8) == 0xf0)
    1740             386 :         l = 4;
    1741                 :     else
    1742             132 :         l = 1;
    1743                 : 
    1744            5178 :     if (l > len)
    1745              90 :         return -1;
    1746                 : 
    1747            5088 :     if (!pg_utf8_islegal(s, l))
    1748             906 :         return -1;
    1749                 : 
    1750            4182 :     return l;
    1751                 : }
    1752                 : 
    1753                 : /*
    1754                 :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
    1755                 :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
    1756                 :  * input byte and current state are used to compute an index into an array of
    1757                 :  * state transitions. Since the address of the next transition is dependent
    1758                 :  * on this computation, there is latency in executing the load instruction,
    1759                 :  * and the CPU is not kept busy.
    1760                 :  *
    1761                 :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
    1762                 :  *
    1763                 :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
    1764                 :  *
    1765                 :  * In a shift-based DFA, the input byte is an index into array of integers
    1766                 :  * whose bit pattern encodes the state transitions. To compute the next
    1767                 :  * state, we simply right-shift the integer by the current state and apply a
    1768                 :  * mask. In this scheme, the address of the transition only depends on the
    1769                 :  * input byte, so there is better pipelining.
    1770                 :  *
    1771                 :  * The naming convention for states and transitions was adopted from a UTF-8
    1772                 :  * to UTF-16/32 transcoder, whose table is reproduced below:
    1773                 :  *
    1774                 :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
    1775                 :  *
    1776                 :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
    1777                 :  * ==========================================================================
    1778                 :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
    1779                 :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
    1780                 :  *                                                                  |
    1781                 :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
    1782                 :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
    1783                 :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
    1784                 :  *                                                                  |
    1785                 :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
    1786                 :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
    1787                 :  *                                                                  |
    1788                 :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
    1789                 :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
    1790                 :  *
    1791                 :  * In the most straightforward implementation, a shift-based DFA for UTF-8
    1792                 :  * requires 64-bit integers to encode the transitions, but with an SMT solver
    1793                 :  * it's possible to find state numbers such that the transitions fit within
    1794                 :  * 32-bit integers, as Dougall Johnson demonstrated:
    1795                 :  *
    1796                 :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
    1797                 :  *
    1798                 :  * This packed representation is the reason for the seemingly odd choice of
    1799                 :  * state values below.
    1800                 :  */
    1801                 : 
    1802                 : /* Error */
    1803                 : #define ERR  0
    1804                 : /* Begin */
    1805                 : #define BGN 11
    1806                 : /* Continuation states, expect 1/2/3 continuation bytes */
    1807                 : #define CS1 16
    1808                 : #define CS2  1
    1809                 : #define CS3  5
    1810                 : /* Partial states, where the first continuation byte has a restricted range */
    1811                 : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
    1812                 : #define P3B 20                  /* Lead was ED, check for surrogate */
    1813                 : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
    1814                 : #define P4B 30                  /* Lead was F4, check for too-large */
    1815                 : /* Begin and End are the same state */
    1816                 : #define END BGN
    1817                 : 
    1818                 : /* the encoded state transitions for the lookup table */
    1819                 : 
    1820                 : /* ASCII */
    1821                 : #define ASC (END << BGN)
    1822                 : /* 2-byte lead */
    1823                 : #define L2A (CS1 << BGN)
    1824                 : /* 3-byte lead */
    1825                 : #define L3A (P3A << BGN)
    1826                 : #define L3B (CS2 << BGN)
    1827                 : #define L3C (P3B << BGN)
    1828                 : /* 4-byte lead */
    1829                 : #define L4A (P4A << BGN)
    1830                 : #define L4B (CS3 << BGN)
    1831                 : #define L4C (P4B << BGN)
    1832                 : /* continuation byte */
    1833                 : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
    1834                 : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
    1835                 : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
    1836                 : /* invalid byte */
    1837                 : #define ILL ERR
    1838                 : 
    1839                 : static const uint32 Utf8Transition[256] =
    1840                 : {
    1841                 :     /* ASCII */
    1842                 : 
    1843                 :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1844                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1845                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1846                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1847                 : 
    1848                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1849                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1850                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1851                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1852                 : 
    1853                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1854                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1855                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1856                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1857                 : 
    1858                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1859                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1860                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1861                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
    1862                 : 
    1863                 :     /* continuation bytes */
    1864                 : 
    1865                 :     /* 80..8F */
    1866                 :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1867                 :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
    1868                 : 
    1869                 :     /* 90..9F */
    1870                 :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1871                 :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
    1872                 : 
    1873                 :     /* A0..BF */
    1874                 :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1875                 :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1876                 :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1877                 :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
    1878                 : 
    1879                 :     /* leading bytes */
    1880                 : 
    1881                 :     /* C0..DF */
    1882                 :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
    1883                 :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1884                 :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1885                 :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
    1886                 : 
    1887                 :     /* E0..EF */
    1888                 :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
    1889                 :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
    1890                 : 
    1891                 :     /* F0..FF */
    1892                 :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
    1893                 :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
    1894                 : };
    1895                 : 
    1896                 : static void
    1897             751 : utf8_advance(const unsigned char *s, uint32 *state, int len)
    1898                 : {
    1899                 :     /* Note: We deliberately don't check the state's value here. */
    1900           24783 :     while (len > 0)
    1901                 :     {
    1902                 :         /*
    1903                 :          * It's important that the mask value is 31: In most instruction sets,
    1904                 :          * a shift by a 32-bit operand is understood to be a shift by its mod
    1905                 :          * 32, so the compiler should elide the mask operation.
    1906                 :          */
    1907           24032 :         *state = Utf8Transition[*s++] >> (*state & 31);
    1908           24032 :         len--;
    1909                 :     }
    1910                 : 
    1911             751 :     *state &= 31;
    1912             751 : }
    1913                 : /*
                         :  * Verify that the "len" bytes at "s" are valid UTF-8.
                         :  *
                         :  * Returns the number of bytes between the start of the input and the
                         :  * point where checking stopped.
                         :  *
                         :  * Inputs of at least STRIDE_LENGTH bytes are first scanned one
                         :  * STRIDE_LENGTH-sized chunk at a time: a chunk is skipped when the state
                         :  * is END and is_valid_ascii() accepts it, otherwise it is run through
                         :  * utf8_advance().  That scan only tracks the state, so afterwards the
                         :  * code either restarts from the beginning (state ERR), backs up to the
                         :  * lead byte of an unfinished sequence (any other non-END state), or
                         :  * simply continues.  Whatever bytes are left are then checked one
                         :  * character at a time; that loop stops early at a zero byte or when
                         :  * pg_utf8_verifychar() returns -1.
                         :  *
                         :  * NOTE(review): STRIDE_LENGTH is built from sizeof, so "len >=
                         :  * STRIDE_LENGTH" is a signed/unsigned comparison; this presumably relies
                         :  * on callers never passing a negative len -- confirm.
                         :  */
    1914                 : static int
    1915         1263025 : pg_utf8_verifystr(const unsigned char *s, int len)
    1916                 : {
    1917         1263025 :     const unsigned char *start = s;
    1918         1263025 :     const int   orig_len = len;
    1919         1263025 :     uint32      state = BGN;
    1920                 : 
    1921                 : /*
    1922                 :  * With a stride of two vector widths, gcc will unroll the loop. Even if
    1923                 :  * the compiler can unroll a longer loop, it's not worth it because we
    1924                 :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
    1925                 :  */
    1926                 : #define STRIDE_LENGTH (2 * sizeof(Vector8))
    1927                 : 
    1928 GIC     1263025 :     if (len >= STRIDE_LENGTH)
    1929 ECB             :     {
    1930 GIC     4896864 :         while (len >= STRIDE_LENGTH)
    1931 ECB             :         {
    1932                 :             /*
    1933                 :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
    1934                 :              * but we must first check for a non-END state, which means the
    1935                 :              * previous chunk ended in the middle of a multibyte sequence.
    1936                 :              */
    1937 GIC     4159691 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
    1938 CBC         751 :                 utf8_advance(s, &state, STRIDE_LENGTH);
    1939 ECB             : 
    1940 GIC     4159691 :             s += STRIDE_LENGTH;
    1941 CBC     4159691 :             len -= STRIDE_LENGTH;
    1942 ECB             :         }
    1943                 : 
    1944                 :         /* The error state persists, so we only need to check for it here. */
    1945 GIC      737173 :         if (state == ERR)
    1946 ECB             :         {
    1947                 :             /*
    1948                 :              * Start over from the beginning with the slow path so we can
    1949                 :              * count the valid bytes.
    1950                 :              */
    1951 GIC         252 :             len = orig_len;
    1952 CBC         252 :             s = start;
    1953 ECB             :         }
    1954 GIC      736921 :         else if (state != END)
    1955 ECB             :         {
    1956                 :             /*
    1957                 :              * The fast path exited in the middle of a multibyte sequence.
    1958                 :              * Walk backwards to find the leading byte so that the slow path
    1959                 :              * can resume checking from there. We must always backtrack at
    1960                 :              * least one byte, since the current byte could be e.g. an ASCII
    1961                 :              * byte after a 2-byte lead, which is invalid.
    1962                 :              */
    1963                 :             do
    1964                 :             {
    1965 GIC          51 :                 Assert(s > start);
    1966 CBC          51 :                 s--;
    1967              51 :                 len++;
    1968              51 :                 Assert(IS_HIGHBIT_SET(*s));
    1969              51 :             } while (pg_utf_mblen(s) <= 1);
    1970 ECB             :         }
    1971                 :     }
    1972                 : 
    1973                 :     /* check remaining bytes */
    1974 GIC    15833150 :     while (len > 0)
    1975 ECB             :     {
    1976                 :         int         l;
    1977                 : 
    1978                 :         /* fast path for ASCII-subset characters */
    1979 GIC    14571209 :         if (!IS_HIGHBIT_SET(*s))
    1980 ECB             :         {
    1981 GIC    14566031 :             if (*s == '\0')
    1982 CBC          88 :                 break;
    1983        14565943 :             l = 1;
    1984 ECB             :         }
    1985                 :         else
    1986                 :         {
    1987 GIC        5178 :             l = pg_utf8_verifychar(s, len);
    1988 CBC        5178 :             if (l == -1)
    1989             996 :                 break;
    1990 ECB             :         }
    1991 GIC    14570125 :         s += l;
    1992 CBC    14570125 :         len -= l;
    1993 ECB             :     }
    1994                 : 
    1995 GIC     1263025 :     return s - start;
    1996 ECB             : }
    1997                 : 
    1998                 : /*
    1999                 :  * Check for validity of a single UTF-8 encoded character
    2000                 :  *
    2001                 :  * This directly implements the rules in RFC3629.  The bizarre-looking
    2002                 :  * restrictions on the second byte are meant to ensure that there isn't
    2003                 :  * more than one encoding of a given Unicode code point; that is,
    2004                 :  * you may not use a longer-than-necessary byte sequence with high order
    2005                 :  * zero bits to represent a character that would fit in fewer bytes.
    2006                 :  * To do otherwise is to create security hazards (eg, create an apparent
    2007                 :  * non-ASCII character that decodes to plain ASCII).
    2008                 :  *
    2009                 :  * length is assumed to have been obtained by pg_utf_mblen(), and the
    2010                 :  * caller must have checked that that many bytes are present in the buffer.
                         :  *
                         :  * Returns true if the "length" bytes at "source" pass every check, false
                         :  * otherwise.  Any length outside 1..4 is rejected.
                         :  *
                         :  * The switch is entered at the case matching "length" and falls through
                         :  * towards case 1, so the trailing bytes are checked last-to-first and
                         :  * the lead byte is checked at the end.
    2011                 :  */
    2012                 : bool
    2013 GIC        7892 : pg_utf8_islegal(const unsigned char *source, int length)
    2014 ECB             : {
    2015                 :     unsigned char a;
    2016                 : 
    2017 GIC        7892 :     switch (length)
    2018 ECB             :     {
    2019 UIC           0 :         default:
    2020 EUB             :             /* reject lengths 5 and 6 for now */
    2021 UIC           0 :             return false;
    2022 GBC         368 :         case 4:
    2023 CBC         368 :             a = source[3];
    2024             368 :             if (a < 0x80 || a > 0xBF)
    2025              48 :                 return false;
    2026 ECB             :             /* FALL THRU */
    2027                 :         case 3:
    2028 GIC        3855 :             a = source[2];
    2029 CBC        3855 :             if (a < 0x80 || a > 0xBF)
    2030             300 :                 return false;
    2031 ECB             :             /* FALL THRU */
    2032                 :         case 2:
    2033 GIC        5733 :             a = source[1];
    2034 CBC        5733 :             switch (*source)    /* allowed range of the 2nd byte depends on the lead byte */
    2035 ECB             :             {
    2036 GIC         156 :                 case 0xE0:    /* 2nd byte A0..BF: no overlong 3-byte forms */
    2037 CBC         156 :                     if (a < 0xA0 || a > 0xBF)
    2038             132 :                         return false;
    2039              24 :                     break;
    2040             156 :                 case 0xED:    /* 2nd byte 80..9F: stays below the UTF-16 surrogate range */
    2041             156 :                     if (a < 0x80 || a > 0x9F)
    2042             132 :                         return false;
    2043              24 :                     break;
    2044             230 :                 case 0xF0:    /* 2nd byte 90..BF: no overlong 4-byte forms */
    2045             230 :                     if (a < 0x90 || a > 0xBF)
    2046             132 :                         return false;
    2047              98 :                     break;
    2048              90 :                 case 0xF4:    /* 2nd byte 80..8F: nothing above U+10FFFF */
    2049              90 :                     if (a < 0x80 || a > 0x8F)
    2050              66 :                         return false;
    2051              24 :                     break;
    2052            5101 :                 default:    /* any other lead byte: plain continuation byte 80..BF */
    2053            5101 :                     if (a < 0x80 || a > 0xBF)
    2054              48 :                         return false;
    2055            5053 :                     break;
    2056 ECB             :             }
    2057                 :             /* FALL THRU */
    2058                 :         case 1:
    2059 GIC        7034 :             a = *source;
    2060 CBC        7034 :             if (a >= 0x80 && a < 0xC2)    /* 80..C1 is never a legal first byte */
    2061             198 :                 return false;
    2062            6836 :             if (a > 0xF4)    /* first bytes above F4 are rejected as well */
    2063              66 :                 return false;
    2064            6770 :             break;
    2065 ECB             :     }
    2066 GIC        6770 :     return true;
    2067 ECB             : }
    2068                 : 
    2069                 : 
    2070                 : /*
    2071                 :  *-------------------------------------------------------------------
    2072                 :  * encoding info table
    2073                 :  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
                         :  *
                         :  * The array is indexed by encoding id.  Each row gives, in order: the
                         :  * ...2wchar conversion function, the wchar2... conversion function, the
                         :  * mblen, dsplen, verifychar and verifystr functions, and the maximum
                         :  * number of bytes in one character.  The last seven rows carry 0 for
                         :  * both conversion functions (the file header notes that only server
                         :  * encodings define them).
    2074                 :  *-------------------------------------------------------------------
    2075                 :  */
    2076                 : const pg_wchar_tbl pg_wchar_table[] = {
    2077                 :     {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},   /* PG_SQL_ASCII */
    2078                 :     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},  /* PG_EUC_JP */
    2079                 :     {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},  /* PG_EUC_CN */
    2080                 :     {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},  /* PG_EUC_KR */
    2081                 :     {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},  /* PG_EUC_TW */
    2082                 :     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},  /* PG_EUC_JIS_2004 */
    2083                 :     {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},  /* PG_UTF8 */
    2084                 :     {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},  /* PG_MULE_INTERNAL */
    2085                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN1 */
    2086                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN2 */
    2087                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN3 */
    2088                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN4 */
    2089                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN5 */
    2090                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN6 */
    2091                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN7 */
    2092                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN8 */
    2093                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN9 */
    2094                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN10 */
    2095                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1256 */
    2096                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1258 */
    2097                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN866 */
    2098                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN874 */
    2099                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_KOI8R */
    2100                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1251 */
    2101                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1252 */
    2102                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-5 */
    2103                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-6 */
    2104                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-7 */
    2105                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-8 */
    2106                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1250 */
    2107                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1253 */
    2108                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1254 */
    2109                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1255 */
    2110                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1257 */
    2111                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_KOI8U */
    2112                 :     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},    /* PG_SJIS */
    2113                 :     {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},    /* PG_BIG5 */
    2114                 :     {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},    /* PG_GBK */
    2115                 :     {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},    /* PG_UHC */
    2116                 :     {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},    /* PG_GB18030 */
    2117                 :     {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},    /* PG_JOHAB */
    2118                 :     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2} /* PG_SHIFT_JIS_2004 */
    2119                 : };
    2120                 : 
    2121                 : /*
    2122                 :  * Returns the byte length of a multibyte character.
    2123                 :  *
    2124                 :  * Caution: when dealing with text that is not certainly valid in the
    2125                 :  * specified encoding, the result may exceed the actual remaining
    2126                 :  * string length.  Callers that are not prepared to deal with that
    2127                 :  * should use pg_encoding_mblen_bounded() instead.
                         :  *
                         :  * The length comes from the mblen function stored in pg_wchar_table for
                         :  * "encoding", applied to the character starting at "mbstr".  If
                         :  * "encoding" does not satisfy PG_VALID_ENCODING(), the PG_SQL_ASCII
                         :  * entry is used instead.
    2128                 :  */
    2129                 : int
    2130 GIC    24328982 : pg_encoding_mblen(int encoding, const char *mbstr)
    2131 ECB             : {
    2132 GIC    24328982 :     return (PG_VALID_ENCODING(encoding) ?
    2133 CBC    48657964 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
    2134 LBC           0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
    2135 EUB             : }
    2136                 : 
    2137                 : /*
    2138                 :  * Returns the byte length of a multibyte character; but not more than
    2139                 :  * the distance to end of string.
                         :  *
                         :  * Implemented as strnlen() over "mbstr" with pg_encoding_mblen()'s result
                         :  * as the limit, so counting stops at the first zero byte found within
                         :  * that many bytes.
    2140                 :  */
    2141                 : int
    2142 GIC          60 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
    2143 ECB             : {
    2144 GIC          60 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
    2145 ECB             : }
    2146                 : 
    2147                 : /*
    2148                 :  * Returns the display length of a multibyte character.
                         :  *
                         :  * The value comes from the dsplen function stored in pg_wchar_table for
                         :  * "encoding", applied to the character starting at "mbstr".  If
                         :  * "encoding" does not satisfy PG_VALID_ENCODING(), the PG_SQL_ASCII
                         :  * entry is used instead.
    2149                 :  */
    2150                 : int
    2151 GIC    24249034 : pg_encoding_dsplen(int encoding, const char *mbstr)
    2152 ECB             : {
    2153 GIC    24249034 :     return (PG_VALID_ENCODING(encoding) ?
    2154 CBC    48498068 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
    2155 LBC           0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
    2156 EUB             : }
    2157                 : 
    2158                 : /*
    2159                 :  * Verify the first multibyte character of the given string.
    2160                 :  * Return its byte length if good, -1 if bad.  (See comments above for
    2161                 :  * full details of the mbverifychar API.)
                         :  *
                         :  * "mbstr" and "len" are passed unchanged to the mbverifychar function
                         :  * stored in pg_wchar_table for "encoding".  If "encoding" does not
                         :  * satisfy PG_VALID_ENCODING(), the PG_SQL_ASCII entry is used instead.
    2162                 :  */
    2163                 : int
    2164 GIC        1170 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
    2165 ECB             : {
    2166 GIC        1170 :     return (PG_VALID_ENCODING(encoding) ?
    2167 CBC        2340 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
    2168 LBC           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
    2169 EUB             : }
    2170                 : 
    2171                 : /*
    2172                 :  * Verify that a string is valid for the given encoding.
    2173                 :  * Returns the number of input bytes (<= len) that form a valid string.
    2174                 :  * (See comments above for full details of the mbverifystr API.)
                         :  *
                         :  * "mbstr" and "len" are passed unchanged to the mbverifystr function
                         :  * stored in pg_wchar_table for "encoding".  If "encoding" does not
                         :  * satisfy PG_VALID_ENCODING(), the PG_SQL_ASCII entry is used instead.
    2175                 :  */
    2176                 : int
    2177 GIC      216868 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
    2178 ECB             : {
    2179 GIC      216868 :     return (PG_VALID_ENCODING(encoding) ?
    2180 CBC      433736 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
    2181 LBC           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
    2182 EUB             : }
    2183                 : 
    2184                 : /*
    2185                 :  * Fetch the maximum length, in bytes, of one character in a given encoding.
                         :  *
                         :  * Returns the maxmblen value stored in pg_wchar_table for "encoding".
                         :  * An encoding that does not satisfy PG_VALID_ENCODING() trips the
                         :  * assertion in assert-enabled builds; otherwise the PG_SQL_ASCII entry
                         :  * is used, as the other pg_encoding_* lookups in this file do.
    2186                 :  */
    2187                 : int
    2188 GIC      500733 : pg_encoding_max_length(int encoding)
    2189 ECB             : {
    2190 GIC      500733 :     Assert(PG_VALID_ENCODING(encoding));
    2191 ECB             : 
                         :     /*
                         :      * Check the encoding again despite the assertion: without it, an
                         :      * out-of-range id would index past the end of pg_wchar_table in
                         :      * builds where Assert() compiles to nothing.
                         :      */
    2192 GIC      500733 :     return (PG_VALID_ENCODING(encoding) ?
                         :             pg_wchar_table[encoding].maxmblen :
                         :             pg_wchar_table[PG_SQL_ASCII].maxmblen);
    2193 ECB             : }
        

Generated by: LCOV version v1.16-55-g56c0a2a