LCOV - differential code coverage report
Current view: top level - src/common - wchar.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 51.8 % 866 449 4 2 411 1 34 1 413 5 31 1
Current Date: 2023-04-08 17:13:01 Functions: 61.7 % 81 50 31 7 2 41 7
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (180,240] days: 100.0 % 1 1 1
Legend: Lines: hit not hit (240..) days: 51.8 % 865 448 4 2 411 1 34 413 5 31
Function coverage date bins:
(240..) days: 56.8 % 88 50 31 7 2 41 7

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * wchar.c
                                  4                 :  *    Functions for working with multibyte characters in various encodings.
                                  5                 :  *
                                  6                 :  * Portions Copyright (c) 1998-2023, PostgreSQL Global Development Group
                                  7                 :  *
                                  8                 :  * IDENTIFICATION
                                  9                 :  *    src/common/wchar.c
                                 10                 :  *
                                 11                 :  *-------------------------------------------------------------------------
                                 12                 :  */
                                 13                 : #include "c.h"
                                 14                 : 
                                 15                 : #include "mb/pg_wchar.h"
                                 16                 : 
                                 17                 : 
                                 18                 : /*
                                 19                 :  * Operations on multi-byte encodings are driven by a table of helper
                                 20                 :  * functions.
                                 21                 :  *
                                 22                 :  * To add support for an encoding, define mblen(), dsplen(), verifychar() and
                                 23                 :  * verifystr() for the encoding.  For server-encodings, also define mb2wchar()
                                 24                 :  * and wchar2mb() conversion functions.
                                 25                 :  *
                                 26                 :  * These functions generally assume that their input is validly formed.
                                 27                 :  * The "verifier" functions, further down in the file, have to be more
                                 28                 :  * paranoid.
                                 29                 :  *
                                 30                 :  * We expect that mblen() does not need to examine more than the first byte
                                 31                 :  * of the character to discover the correct length.  GB18030 is an exception
                                 32                 :  * to that rule, though, as it also looks at the second byte.  But even that
                                 33                 :  * behaves in a predictable way, if you only pass the first byte: it will
                                 34                 :  * treat 4-byte encoded characters as two 2-byte encoded characters, which is
                                 35                 :  * good enough for all current uses.
                                 36                 :  *
                                 37                 :  * Note: for the display output of psql to work properly, the return values
                                 38                 :  * of the dsplen functions must conform to the Unicode standard. In particular
                                 39                 :  * the NUL character is zero width and control characters are generally
                                 40                 :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
                                 41                 :  * subset to the ASCII routines to ensure consistency.
                                 42                 :  */
                                 43                 : 
                                 44                 : /*
                                 45                 :  * SQL/ASCII
                                 46                 :  */
                                 47                 : static int
 5655 tgl                        48 CBC         246 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                                 49                 : {
 8053 bruce                      50             246 :     int         cnt = 0;
                                 51                 : 
 8067 tgl                        52            4657 :     while (len > 0 && *from)
                                 53                 :     {
 8986 bruce                      54            4411 :         *to++ = *from++;
                                 55            4411 :         len--;
 8260 ishii                      56            4411 :         cnt++;
                                 57                 :     }
 8986 bruce                      58             246 :     *to = 0;
 6315                            59             246 :     return cnt;
                                 60                 : }
                                 61                 : 
                                 62                 : static int
 8986                            63             837 : pg_ascii_mblen(const unsigned char *s)
                                 64                 : {
 6315                            65             837 :     return 1;
                                 66                 : }
                                 67                 : 
                                 68                 : static int
 6964 ishii                      69             400 : pg_ascii_dsplen(const unsigned char *s)
                                 70                 : {
 6267 bruce                      71             400 :     if (*s == '\0')
 6267 bruce                      72 UBC           0 :         return 0;
 6267 bruce                      73 CBC         400 :     if (*s < 0x20 || *s == 0x7f)
 6267 bruce                      74 UBC           0 :         return -1;
                                 75                 : 
 6315 bruce                      76 CBC         400 :     return 1;
                                 77                 : }
                                 78                 : 
                                 79                 : /*
                                 80                 :  * EUC
                                 81                 :  */
                                 82                 : static int
 5655 tgl                        83 UBC           0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                                 84                 : {
 8053 bruce                      85               0 :     int         cnt = 0;
                                 86                 : 
 8067 tgl                        87               0 :     while (len > 0 && *from)
                                 88                 :     {
 6031 bruce                      89               0 :         if (*from == SS2 && len >= 2)    /* JIS X 0201 (so called "1 byte
                                 90                 :                                          * KANA") */
                                 91                 :         {
 8986                            92               0 :             from++;
 6315 ishii                      93               0 :             *to = (SS2 << 8) | *from++;
 8067 tgl                        94               0 :             len -= 2;
                                 95                 :         }
 2118                            96               0 :         else if (*from == SS3 && len >= 3)   /* JIS X 0212 KANJI */
                                 97                 :         {
 8986 bruce                      98               0 :             from++;
 6315 ishii                      99               0 :             *to = (SS3 << 16) | (*from++ << 8);
                                100               0 :             *to |= *from++;
 8986 bruce                     101               0 :             len -= 3;
                                102                 :         }
 2118 tgl                       103               0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
                                104                 :         {
 8986 bruce                     105               0 :             *to = *from++ << 8;
                                106               0 :             *to |= *from++;
                                107               0 :             len -= 2;
                                108                 :         }
                                109                 :         else                    /* must be ASCII */
                                110                 :         {
                                111               0 :             *to = *from++;
                                112               0 :             len--;
                                113                 :         }
                                114               0 :         to++;
 8260 ishii                     115               0 :         cnt++;
                                116                 :     }
 8986 bruce                     117               0 :     *to = 0;
 6315                           118               0 :     return cnt;
                                119                 : }
                                120                 : 
                                121                 : static inline int
 8986 bruce                     122 CBC          90 : pg_euc_mblen(const unsigned char *s)
                                123                 : {
                                124                 :     int         len;
                                125                 : 
                                126              90 :     if (*s == SS2)
 8986 bruce                     127 UBC           0 :         len = 2;
 8986 bruce                     128 CBC          90 :     else if (*s == SS3)
 8986 bruce                     129 UBC           0 :         len = 3;
 6314 bruce                     130 CBC          90 :     else if (IS_HIGHBIT_SET(*s))
 8986                           131              54 :         len = 2;
                                132                 :     else
                                133              36 :         len = 1;
 6315                           134              90 :     return len;
                                135                 : }
                                136                 : 
                                137                 : static inline int
 6964 ishii                     138 UBC           0 : pg_euc_dsplen(const unsigned char *s)
                                139                 : {
                                140                 :     int         len;
                                141                 : 
                                142               0 :     if (*s == SS2)
                                143               0 :         len = 2;
                                144               0 :     else if (*s == SS3)
                                145               0 :         len = 2;
 6314 bruce                     146               0 :     else if (IS_HIGHBIT_SET(*s))
 6964 ishii                     147               0 :         len = 2;
                                148                 :     else
 6267 bruce                     149               0 :         len = pg_ascii_dsplen(s);
 6315                           150               0 :     return len;
                                151                 : }
                                152                 : 
                                153                 : /*
                                154                 :  * EUC_JP
                                155                 :  */
                                156                 : static int
 5655 tgl                       157               0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                                158                 : {
 6315 bruce                     159               0 :     return pg_euc2wchar_with_len(from, to, len);
                                160                 : }
                                161                 : 
                                162                 : static int
 8986 bruce                     163 CBC          90 : pg_eucjp_mblen(const unsigned char *s)
                                164                 : {
 6315                           165              90 :     return pg_euc_mblen(s);
                                166                 : }
                                167                 : 
                                168                 : static int
 6964 ishii                     169 UBC           0 : pg_eucjp_dsplen(const unsigned char *s)
                                170                 : {
                                171                 :     int         len;
                                172                 : 
                                173               0 :     if (*s == SS2)
                                174               0 :         len = 1;
                                175               0 :     else if (*s == SS3)
                                176               0 :         len = 2;
 6314 bruce                     177               0 :     else if (IS_HIGHBIT_SET(*s))
 6964 ishii                     178               0 :         len = 2;
                                179                 :     else
 6267 bruce                     180               0 :         len = pg_ascii_dsplen(s);
 6315                           181               0 :     return len;
                                182                 : }
                                183                 : 
                                184                 : /*
                                185                 :  * EUC_KR
                                186                 :  */
                                187                 : static int
 5655 tgl                       188               0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                                189                 : {
 6315 bruce                     190               0 :     return pg_euc2wchar_with_len(from, to, len);
                                191                 : }
                                192                 : 
                                193                 : static int
 8986                           194               0 : pg_euckr_mblen(const unsigned char *s)
                                195                 : {
 6315                           196               0 :     return pg_euc_mblen(s);
                                197                 : }
                                198                 : 
                                199                 : static int
 6964 ishii                     200               0 : pg_euckr_dsplen(const unsigned char *s)
                                201                 : {
 6315 bruce                     202               0 :     return pg_euc_dsplen(s);
                                203                 : }
                                204                 : 
                                205                 : /*
                                206                 :  * EUC_CN
                                207                 :  *
                                208                 :  */
                                209                 : static int
 5655 tgl                       210               0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                                211                 : {
 8053 bruce                     212               0 :     int         cnt = 0;
                                213                 : 
 8067 tgl                       214               0 :     while (len > 0 && *from)
                                215                 :     {
 6315 ishii                     216               0 :         if (*from == SS2 && len >= 3)    /* code set 2 (unused?) */
                                217                 :         {
 8986 bruce                     218               0 :             from++;
 6315 ishii                     219               0 :             *to = (SS2 << 16) | (*from++ << 8);
                                220               0 :             *to |= *from++;
 8067 tgl                       221               0 :             len -= 3;
                                222                 :         }
 2118                           223               0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused ?) */
                                224                 :         {
 8986 bruce                     225               0 :             from++;
 6315 ishii                     226               0 :             *to = (SS3 << 16) | (*from++ << 8);
                                227               0 :             *to |= *from++;
 8986 bruce                     228               0 :             len -= 3;
                                229                 :         }
 2118 tgl                       230               0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
                                231                 :         {
 8986 bruce                     232               0 :             *to = *from++ << 8;
                                233               0 :             *to |= *from++;
                                234               0 :             len -= 2;
                                235                 :         }
                                236                 :         else
                                237                 :         {
                                238               0 :             *to = *from++;
                                239               0 :             len--;
                                240                 :         }
                                241               0 :         to++;
 8260 ishii                     242               0 :         cnt++;
                                243                 :     }
 8986 bruce                     244               0 :     *to = 0;
 6315                           245               0 :     return cnt;
                                246                 : }
                                247                 : 
                                248                 : static int
 8986                           249               0 : pg_euccn_mblen(const unsigned char *s)
                                250                 : {
                                251                 :     int         len;
                                252                 : 
 6314                           253               0 :     if (IS_HIGHBIT_SET(*s))
 8986                           254               0 :         len = 2;
                                255                 :     else
                                256               0 :         len = 1;
 6315                           257               0 :     return len;
                                258                 : }
                                259                 : 
                                260                 : static int
 6964 ishii                     261               0 : pg_euccn_dsplen(const unsigned char *s)
                                262                 : {
                                263                 :     int         len;
                                264                 : 
 6314 bruce                     265               0 :     if (IS_HIGHBIT_SET(*s))
 6964 ishii                     266               0 :         len = 2;
                                267                 :     else
 6267 bruce                     268               0 :         len = pg_ascii_dsplen(s);
 6315                           269               0 :     return len;
                                270                 : }
                                271                 : 
                                272                 : /*
                                273                 :  * EUC_TW
                                274                 :  *
                                275                 :  */
                                276                 : static int
 5655 tgl                       277               0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                                278                 : {
 8053 bruce                     279               0 :     int         cnt = 0;
                                280                 : 
 8067 tgl                       281               0 :     while (len > 0 && *from)
                                282                 :     {
 6315 ishii                     283               0 :         if (*from == SS2 && len >= 4)    /* code set 2 */
                                284                 :         {
 8986 bruce                     285               0 :             from++;
 5750 tgl                       286               0 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);
 8986 bruce                     287               0 :             *to |= *from++ << 8;
                                288               0 :             *to |= *from++;
 8067 tgl                       289               0 :             len -= 4;
                                290                 :         }
 2118                           291               0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused?) */
                                292                 :         {
 8986 bruce                     293               0 :             from++;
 6315 ishii                     294               0 :             *to = (SS3 << 16) | (*from++ << 8);
                                295               0 :             *to |= *from++;
 8986 bruce                     296               0 :             len -= 3;
                                297                 :         }
 2118 tgl                       298               0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
                                299                 :         {
 8986 bruce                     300               0 :             *to = *from++ << 8;
                                301               0 :             *to |= *from++;
                                302               0 :             len -= 2;
                                303                 :         }
                                304                 :         else
                                305                 :         {
                                306               0 :             *to = *from++;
                                307               0 :             len--;
                                308                 :         }
                                309               0 :         to++;
 8260 ishii                     310               0 :         cnt++;
                                311                 :     }
 8986 bruce                     312               0 :     *to = 0;
 6315                           313               0 :     return cnt;
                                314                 : }
                                315                 : 
                                316                 : static int
 8986                           317               0 : pg_euctw_mblen(const unsigned char *s)
                                318                 : {
                                319                 :     int         len;
                                320                 : 
                                321               0 :     if (*s == SS2)
                                322               0 :         len = 4;
                                323               0 :     else if (*s == SS3)
                                324               0 :         len = 3;
 6314                           325               0 :     else if (IS_HIGHBIT_SET(*s))
 8986                           326               0 :         len = 2;
                                327                 :     else
 6167 tgl                       328               0 :         len = 1;
 6315 bruce                     329               0 :     return len;
                                330                 : }
                                331                 : 
                                332                 : static int
 6964 ishii                     333               0 : pg_euctw_dsplen(const unsigned char *s)
                                334                 : {
                                335                 :     int         len;
                                336                 : 
                                337               0 :     if (*s == SS2)
                                338               0 :         len = 2;
                                339               0 :     else if (*s == SS3)
                                340               0 :         len = 2;
 6314 bruce                     341               0 :     else if (IS_HIGHBIT_SET(*s))
 6964 ishii                     342               0 :         len = 2;
                                343                 :     else
 6267 bruce                     344               0 :         len = pg_ascii_dsplen(s);
 6315                           345               0 :     return len;
                                346                 : }
                                347                 : 
                                348                 : /*
                                349                 :  * Convert pg_wchar to EUC_* encoding.
                                350                 :  * caller must allocate enough space for "to", including a trailing zero!
                                351                 :  * len: length of from.
                                352                 :  * "from" not necessarily null terminated.
                                353                 :  */
                                354                 : static int
 3931 rhaas                     355               0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
                                356                 : {
                                357               0 :     int         cnt = 0;
                                358                 : 
                                359               0 :     while (len > 0 && *from)
                                360                 :     {
                                361                 :         unsigned char c;
                                362                 : 
 3925 tgl                       363               0 :         if ((c = (*from >> 24)))
                                364                 :         {
 3931 rhaas                     365               0 :             *to++ = c;
                                366               0 :             *to++ = (*from >> 16) & 0xff;
                                367               0 :             *to++ = (*from >> 8) & 0xff;
                                368               0 :             *to++ = *from & 0xff;
                                369               0 :             cnt += 4;
                                370                 :         }
 3925 tgl                       371               0 :         else if ((c = (*from >> 16)))
                                372                 :         {
 3931 rhaas                     373               0 :             *to++ = c;
                                374               0 :             *to++ = (*from >> 8) & 0xff;
                                375               0 :             *to++ = *from & 0xff;
                                376               0 :             cnt += 3;
                                377                 :         }
 3925 tgl                       378               0 :         else if ((c = (*from >> 8)))
                                379                 :         {
 3931 rhaas                     380               0 :             *to++ = c;
                                381               0 :             *to++ = *from & 0xff;
                                382               0 :             cnt += 2;
                                383                 :         }
                                384                 :         else
                                385                 :         {
                                386               0 :             *to++ = *from;
                                387               0 :             cnt++;
                                388                 :         }
 3930                           389               0 :         from++;
 3931                           390               0 :         len--;
                                391                 :     }
                                392               0 :     *to = 0;
                                393               0 :     return cnt;
                                394                 : }
                                395                 : 
                                396                 : 
                                397                 : /*
                                398                 :  * JOHAB
                                399                 :  */
                                400                 : static int
 7705 bruce                     401               0 : pg_johab_mblen(const unsigned char *s)
                                402                 : {
 6315                           403               0 :     return pg_euc_mblen(s);
                                404                 : }
                                405                 : 
                                406                 : static int
 6964 ishii                     407               0 : pg_johab_dsplen(const unsigned char *s)
                                408                 : {
 6315 bruce                     409               0 :     return pg_euc_dsplen(s);
                                410                 : }
                                411                 : 
                                412                 : /*
                                413                 :  * convert UTF8 string to pg_wchar (UCS-4)
                                414                 :  * caller must allocate enough space for "to", including a trailing zero!
                                415                 :  * len: length of from.
                                416                 :  * "from" not necessarily null terminated.
                                417                 :  */
                                418                 : static int
 7836 bruce                     419 CBC      467195 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                                420                 : {
 8053                           421          467195 :     int         cnt = 0;
                                422                 :     uint32      c1,
                                423                 :                 c2,
                                424                 :                 c3,
                                425                 :                 c4;
                                426                 : 
 8067 tgl                       427        24423147 :     while (len > 0 && *from)
                                428                 :     {
 5919                           429        23955952 :         if ((*from & 0x80) == 0)
                                430                 :         {
 8986 bruce                     431        23955634 :             *to = *from++;
                                432        23955634 :             len--;
                                433                 :         }
 5919 tgl                       434             318 :         else if ((*from & 0xe0) == 0xc0)
                                435                 :         {
                                436             278 :             if (len < 2)
 5919 tgl                       437 UBC           0 :                 break;          /* drop trailing incomplete char */
 8986 bruce                     438 CBC         278 :             c1 = *from++ & 0x1f;
                                439             278 :             c2 = *from++ & 0x3f;
 5919 tgl                       440             278 :             *to = (c1 << 6) | c2;
 8067                           441             278 :             len -= 2;
                                442                 :         }
 5919                           443              40 :         else if ((*from & 0xf0) == 0xe0)
                                444                 :         {
                                445              40 :             if (len < 3)
 5919 tgl                       446 UBC           0 :                 break;          /* drop trailing incomplete char */
 8986 bruce                     447 CBC          40 :             c1 = *from++ & 0x0f;
                                448              40 :             c2 = *from++ & 0x3f;
                                449              40 :             c3 = *from++ & 0x3f;
 5919 tgl                       450              40 :             *to = (c1 << 12) | (c2 << 6) | c3;
 8067                           451              40 :             len -= 3;
                                452                 :         }
 5919 tgl                       453 UBC           0 :         else if ((*from & 0xf8) == 0xf0)
                                454                 :         {
                                455               0 :             if (len < 4)
                                456               0 :                 break;          /* drop trailing incomplete char */
                                457               0 :             c1 = *from++ & 0x07;
                                458               0 :             c2 = *from++ & 0x3f;
                                459               0 :             c3 = *from++ & 0x3f;
                                460               0 :             c4 = *from++ & 0x3f;
                                461               0 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
                                462               0 :             len -= 4;
                                463                 :         }
                                464                 :         else
                                465                 :         {
                                466                 :             /* treat a bogus char as length 1; not ours to raise error */
 8750                           467               0 :             *to = *from++;
                                468               0 :             len--;
                                469                 :         }
 8986 bruce                     470 CBC    23955952 :         to++;
 8260 ishii                     471        23955952 :         cnt++;
                                472                 :     }
 8986 bruce                     473          467195 :     *to = 0;
 6315                           474          467195 :     return cnt;
                                475                 : }
                                476                 : 
                                477                 : 
                                478                 : /*
                                479                 :  * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
                                480                 :  * space allocated.
                                481                 :  */
                                482                 : unsigned char *
 5275 peter_e                   483         7818723 : unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
                                484                 : {
                                485         7818723 :     if (c <= 0x7F)
                                486                 :     {
                                487         7818439 :         utf8string[0] = c;
                                488                 :     }
                                489             284 :     else if (c <= 0x7FF)
                                490                 :     {
                                491             132 :         utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
                                492             132 :         utf8string[1] = 0x80 | (c & 0x3F);
                                493                 :     }
                                494             152 :     else if (c <= 0xFFFF)
                                495                 :     {
                                496             122 :         utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
                                497             122 :         utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
                                498             122 :         utf8string[2] = 0x80 | (c & 0x3F);
                                499                 :     }
                                500                 :     else
                                501                 :     {
                                502              30 :         utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
                                503              30 :         utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
                                504              30 :         utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
                                505              30 :         utf8string[3] = 0x80 | (c & 0x3F);
                                506                 :     }
                                507                 : 
                                508         7818723 :     return utf8string;
                                509                 : }
                                510                 : 
                                511                 : /*
                                512                 :  * Trivial conversion from pg_wchar to UTF-8.
                                513                 :  * caller should allocate enough space for "to"
                                514                 :  * len: length of from.
                                515                 :  * "from" not necessarily null terminated.
                                516                 :  */
                                517                 : static int
 3931 rhaas                     518          555690 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
                                519                 : {
                                520          555690 :     int         cnt = 0;
                                521                 : 
                                522         8374066 :     while (len > 0 && *from)
                                523                 :     {
                                524                 :         int         char_len;
                                525                 : 
                                526         7818376 :         unicode_to_utf8(*from, to);
                                527         7818376 :         char_len = pg_utf_mblen(to);
                                528         7818376 :         cnt += char_len;
                                529         7818376 :         to += char_len;
 3930                           530         7818376 :         from++;
                                531         7818376 :         len--;
                                532                 :     }
 3931                           533          555690 :     *to = 0;
                                534          555690 :     return cnt;
                                535                 : }
                                536                 : 
                                537                 : /*
                                538                 :  * Return the byte length of a UTF8 character pointed to by s
                                539                 :  *
                                540                 :  * Note: in the current implementation we do not support UTF8 sequences
                                541                 :  * of more than 4 bytes; hence do NOT return a value larger than 4.
                                542                 :  * We return "1" for any leading byte that is either flat-out illegal or
                                543                 :  * indicates a length larger than we support.
                                544                 :  *
                                545                 :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
                                546                 :  * other places would need to be fixed to change this.
                                547                 :  */
                                548                 : int
 6701 bruce                     549       144827375 : pg_utf_mblen(const unsigned char *s)
                                550                 : {
                                551                 :     int         len;
                                552                 : 
 8986                           553       144827375 :     if ((*s & 0x80) == 0)
                                554       144814332 :         len = 1;
                                555           13043 :     else if ((*s & 0xe0) == 0xc0)
                                556            6366 :         len = 2;
 6385                           557            6677 :     else if ((*s & 0xf0) == 0xe0)
                                558            6227 :         len = 3;
                                559             450 :     else if ((*s & 0xf8) == 0xf0)
                                560             364 :         len = 4;
                                561                 : #ifdef NOT_USED
                                562                 :     else if ((*s & 0xfc) == 0xf8)
                                563                 :         len = 5;
                                564                 :     else if ((*s & 0xfe) == 0xfc)
                                565                 :         len = 6;
                                566                 : #endif
                                567                 :     else
 5919 tgl                       568              86 :         len = 1;
 6315 bruce                     569       144827375 :     return len;
                                570                 : }
                                571                 : 
                                572                 : /*
                                573                 :  * This is an implementation of wcwidth() and wcswidth() as defined in
                                574                 :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
                                575                 :  * <http://www.unix.org/online.html>
                                576                 :  *
                                577                 :  * Markus Kuhn -- 2001-09-08 -- public domain
                                578                 :  *
                                579                 :  * customised for PostgreSQL
                                580                 :  *
                                581                 :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
                                582                 :  */
                                583                 : 
                                584                 : struct mbinterval
                                585                 : {
                                586                 :     unsigned int first;
                                587                 :     unsigned int last;
                                588                 : };
                                589                 : 
                                590                 : /* auxiliary function for binary search in interval table */
                                591                 : static int
 2118 tgl                       592        48447406 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
                                593                 : {
 6267 bruce                     594        48447406 :     int         min = 0;
                                595                 :     int         mid;
                                596                 : 
                                597        48447406 :     if (ucs < table[0].first || ucs > table[max].last)
  591 john.naylor               598        48444614 :         return 0;
 6267 bruce                     599           24732 :     while (max >= min)
                                600                 :     {
                                601           22060 :         mid = (min + max) / 2;
                                602           22060 :         if (ucs > table[mid].last)
                                603            3644 :             min = mid + 1;
                                604           18416 :         else if (ucs < table[mid].first)
                                605           18296 :             max = mid - 1;
                                606                 :         else
  591 john.naylor               607             120 :             return 1;
                                608                 :     }
                                609                 : 
                                610            2672 :     return 0;
                                611                 : }
                                612                 : 
                                613                 : 
                                614                 : /* The following functions define the column width of an ISO 10646
                                615                 :  * character as follows:
                                616                 :  *
                                617                 :  *    - The null character (U+0000) has a column width of 0.
                                618                 :  *
                                619                 :  *    - Other C0/C1 control characters and DEL will lead to a return
                                620                 :  *      value of -1.
                                621                 :  *
                                622                 :  *    - Non-spacing and enclosing combining characters (general
                                623                 :  *      category code Mn, Me or Cf in the Unicode database) have a
                                624                 :  *      column width of 0.
                                625                 :  *
                                626                 :  *    - Spacing characters in the East Asian Wide (W) or East Asian
                                627                 :  *      FullWidth (F) category as defined in Unicode Technical
                                628                 :  *      Report #11 have a column width of 2.
                                629                 :  *
                                630                 :  *    - All remaining characters (including all printable
                                631                 :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
                                632                 :  *      etc.) have a column width of 1.
                                633                 :  *
                                634                 :  * This implementation assumes that wchar_t characters are encoded
                                635                 :  * in ISO 10646.
                                636                 :  */
                                637                 : 
                                638                 : static int
 6267 bruce                     639        24252996 : ucs_wcwidth(pg_wchar ucs)
                                640                 : {
                                641                 : #include "common/unicode_nonspacing_table.h"
                                642                 : #include "common/unicode_east_asian_fw_table.h"
                                643                 : 
                                644                 :     /* test for 8-bit control characters */
                                645        24252996 :     if (ucs == 0)
 6267 bruce                     646 UBC           0 :         return 0;
                                647                 : 
 6267 bruce                     648 CBC    24252996 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
                                649           29257 :         return -1;
                                650                 : 
                                651                 :     /*
                                652                 :      * binary search in table of non-spacing characters
                                653                 :      *
                                654                 :      * XXX: In the official Unicode sources, it is possible for a character to
                                655                 :      * be described as both non-spacing and wide at the same time. As of
                                656                 :      * Unicode 13.0, treating the non-spacing property as the determining
                                657                 :      * factor for display width leads to the correct behavior, so do that
                                658                 :      * search first.
                                659                 :      */
  208 john.naylor               660 GNC    24223739 :     if (mbbisearch(ucs, nonspacing,
                                661                 :                    sizeof(nonspacing) / sizeof(struct mbinterval) - 1))
  591 john.naylor               662 CBC          72 :         return 0;
                                663                 : 
                                664                 :     /* binary search in table of wide characters */
                                665        24223667 :     if (mbbisearch(ucs, east_asian_fw,
                                666                 :                    sizeof(east_asian_fw) / sizeof(struct mbinterval) - 1))
                                667              48 :         return 2;
                                668                 : 
                                669        24223619 :     return 1;
                                670                 : }
                                671                 : 
                                672                 : /*
                                673                 :  * Convert a UTF-8 character to a Unicode code point.
                                674                 :  * This is a one-character version of pg_utf2wchar_with_len.
                                675                 :  *
                                676                 :  * No error checks here, c must point to a long-enough string.
                                677                 :  */
                                678                 : pg_wchar
 4617 tgl                       679        24253729 : utf8_to_unicode(const unsigned char *c)
                                680                 : {
 6267 bruce                     681        24253729 :     if ((*c & 0x80) == 0)
                                682        24250873 :         return (pg_wchar) c[0];
                                683            2856 :     else if ((*c & 0xe0) == 0xc0)
                                684            2654 :         return (pg_wchar) (((c[0] & 0x1f) << 6) |
                                685            2654 :                            (c[1] & 0x3f));
                                686             202 :     else if ((*c & 0xf0) == 0xe0)
                                687             154 :         return (pg_wchar) (((c[0] & 0x0f) << 12) |
                                688             154 :                            ((c[1] & 0x3f) << 6) |
                                689             154 :                            (c[2] & 0x3f));
 5919 tgl                       690              48 :     else if ((*c & 0xf8) == 0xf0)
 6267 bruce                     691              48 :         return (pg_wchar) (((c[0] & 0x07) << 18) |
                                692              48 :                            ((c[1] & 0x3f) << 12) |
                                693              48 :                            ((c[2] & 0x3f) << 6) |
                                694              48 :                            (c[3] & 0x3f));
                                695                 :     else
                                696                 :         /* that is an invalid code on purpose */
 6267 bruce                     697 UBC           0 :         return 0xffffffff;
                                698                 : }
                                699                 : 
                                700                 : static int
 6701 bruce                     701 CBC    24252996 : pg_utf_dsplen(const unsigned char *s)
                                702                 : {
 4617 tgl                       703        24252996 :     return ucs_wcwidth(utf8_to_unicode(s));
                                704                 : }
                                705                 : 
                                706                 : /*
                                707                 :  * convert mule internal code to pg_wchar
                                708                 :  * caller should allocate enough space for "to"
                                709                 :  * len: length of from.
                                710                 :  * "from" not necessarily null terminated.
                                711                 :  */
                                712                 : static int
 7836 bruce                     713 UBC           0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                                714                 : {
 8053                           715               0 :     int         cnt = 0;
                                716                 : 
 8067 tgl                       717               0 :     while (len > 0 && *from)
                                718                 :     {
                                719               0 :         if (IS_LC1(*from) && len >= 2)
                                720                 :         {
 8986 bruce                     721               0 :             *to = *from++ << 16;
                                722               0 :             *to |= *from++;
                                723               0 :             len -= 2;
                                724                 :         }
 8067 tgl                       725               0 :         else if (IS_LCPRV1(*from) && len >= 3)
                                726                 :         {
 8986 bruce                     727               0 :             from++;
                                728               0 :             *to = *from++ << 16;
                                729               0 :             *to |= *from++;
                                730               0 :             len -= 3;
                                731                 :         }
 8067 tgl                       732               0 :         else if (IS_LC2(*from) && len >= 3)
                                733                 :         {
 8986 bruce                     734               0 :             *to = *from++ << 16;
                                735               0 :             *to |= *from++ << 8;
                                736               0 :             *to |= *from++;
                                737               0 :             len -= 3;
                                738                 :         }
 8067 tgl                       739               0 :         else if (IS_LCPRV2(*from) && len >= 4)
                                740                 :         {
 8986 bruce                     741               0 :             from++;
                                742               0 :             *to = *from++ << 16;
                                743               0 :             *to |= *from++ << 8;
                                744               0 :             *to |= *from++;
                                745               0 :             len -= 4;
                                746                 :         }
                                747                 :         else
                                748                 :         {                       /* assume ASCII */
                                749               0 :             *to = (unsigned char) *from++;
                                750               0 :             len--;
                                751                 :         }
                                752               0 :         to++;
 8260 ishii                     753               0 :         cnt++;
                                754                 :     }
 8986 bruce                     755               0 :     *to = 0;
 6315                           756               0 :     return cnt;
                                757                 : }
                                758                 : 
                                759                 : /*
                                760                 :  * convert pg_wchar to mule internal code
                                761                 :  * caller should allocate enough space for "to"
                                762                 :  * len: length of from.
                                763                 :  * "from" not necessarily null terminated.
                                764                 :  */
                                765                 : static int
 3931 rhaas                     766               0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
                                767                 : {
                                768               0 :     int         cnt = 0;
                                769                 : 
                                770               0 :     while (len > 0 && *from)
                                771                 :     {
                                772                 :         unsigned char lb;
                                773                 : 
                                774               0 :         lb = (*from >> 16) & 0xff;
                                775               0 :         if (IS_LC1(lb))
                                776                 :         {
                                777               0 :             *to++ = lb;
                                778               0 :             *to++ = *from & 0xff;
                                779               0 :             cnt += 2;
                                780                 :         }
                                781               0 :         else if (IS_LC2(lb))
                                782                 :         {
                                783               0 :             *to++ = lb;
                                784               0 :             *to++ = (*from >> 8) & 0xff;
                                785               0 :             *to++ = *from & 0xff;
                                786               0 :             cnt += 3;
                                787                 :         }
                                788               0 :         else if (IS_LCPRV1_A_RANGE(lb))
                                789                 :         {
                                790               0 :             *to++ = LCPRV1_A;
                                791               0 :             *to++ = lb;
                                792               0 :             *to++ = *from & 0xff;
                                793               0 :             cnt += 3;
                                794                 :         }
                                795               0 :         else if (IS_LCPRV1_B_RANGE(lb))
                                796                 :         {
                                797               0 :             *to++ = LCPRV1_B;
                                798               0 :             *to++ = lb;
                                799               0 :             *to++ = *from & 0xff;
                                800               0 :             cnt += 3;
                                801                 :         }
                                802               0 :         else if (IS_LCPRV2_A_RANGE(lb))
                                803                 :         {
                                804               0 :             *to++ = LCPRV2_A;
                                805               0 :             *to++ = lb;
                                806               0 :             *to++ = (*from >> 8) & 0xff;
                                807               0 :             *to++ = *from & 0xff;
                                808               0 :             cnt += 4;
                                809                 :         }
                                810               0 :         else if (IS_LCPRV2_B_RANGE(lb))
                                811                 :         {
                                812               0 :             *to++ = LCPRV2_B;
                                813               0 :             *to++ = lb;
                                814               0 :             *to++ = (*from >> 8) & 0xff;
                                815               0 :             *to++ = *from & 0xff;
                                816               0 :             cnt += 4;
                                817                 :         }
                                818                 :         else
                                819                 :         {
 3925 tgl                       820               0 :             *to++ = *from & 0xff;
 3931 rhaas                     821               0 :             cnt += 1;
                                822                 :         }
 3930                           823               0 :         from++;
 3931                           824               0 :         len--;
                                825                 :     }
                                826               0 :     *to = 0;
                                827               0 :     return cnt;
                                828                 : }
                                829                 : 
                                830                 : /* exported for direct use by conv.c */
                                831                 : int
 8986 bruce                     832 CBC        1476 : pg_mule_mblen(const unsigned char *s)
                                833                 : {
                                834                 :     int         len;
                                835                 : 
                                836            1476 :     if (IS_LC1(*s))
                                837             594 :         len = 2;
                                838             882 :     else if (IS_LCPRV1(*s))
 8986 bruce                     839 UBC           0 :         len = 3;
 8986 bruce                     840 CBC         882 :     else if (IS_LC2(*s))
                                841             855 :         len = 3;
                                842              27 :     else if (IS_LCPRV2(*s))
 8986 bruce                     843 UBC           0 :         len = 4;
                                844                 :     else
 6031 bruce                     845 CBC          27 :         len = 1;                /* assume ASCII */
 6315                           846            1476 :     return len;
                                847                 : }
                                848                 : 
                                849                 : static int
 6964 ishii                     850 UBC           0 : pg_mule_dsplen(const unsigned char *s)
                                851                 : {
                                852                 :     int         len;
                                853                 : 
                                854                 :     /*
                                855                 :      * Note: it's not really appropriate to assume that all multibyte charsets
                                856                 :      * are double-wide on screen.  But this seems an okay approximation for
                                857                 :      * the MULE charsets we currently support.
                                858                 :      */
                                859                 : 
 6167 tgl                       860               0 :     if (IS_LC1(*s))
                                861               0 :         len = 1;
                                862               0 :     else if (IS_LCPRV1(*s))
                                863               0 :         len = 1;
                                864               0 :     else if (IS_LC2(*s))
                                865               0 :         len = 2;
                                866               0 :     else if (IS_LCPRV2(*s))
                                867               0 :         len = 2;
                                868                 :     else
 6031 bruce                     869               0 :         len = 1;                /* assume ASCII */
                                870                 : 
 6167 tgl                       871               0 :     return len;
                                872                 : }
                                873                 : 
                                874                 : /*
                                875                 :  * ISO8859-1
                                876                 :  */
                                877                 : static int
 7836 bruce                     878 CBC         507 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
                                879                 : {
 8053                           880             507 :     int         cnt = 0;
                                881                 : 
 8067 tgl                       882           14193 :     while (len > 0 && *from)
                                883                 :     {
 8986 bruce                     884           13686 :         *to++ = *from++;
 8067 tgl                       885           13686 :         len--;
 8260 ishii                     886           13686 :         cnt++;
                                887                 :     }
 8986 bruce                     888             507 :     *to = 0;
 6315                           889             507 :     return cnt;
                                890                 : }
                                891                 : 
                                892                 : /*
                                893                 :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
                                894                 :  * high bits.
                                895                 :  * caller should allocate enough space for "to"
                                896                 :  * len: length of from.
                                897                 :  * "from" not necessarily null terminated.
                                898                 :  */
                                899                 : static int
 3931 rhaas                     900              51 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
                                901                 : {
                                902              51 :     int         cnt = 0;
                                903                 : 
                                904             396 :     while (len > 0 && *from)
                                905                 :     {
                                906             345 :         *to++ = *from++;
                                907             345 :         len--;
                                908             345 :         cnt++;
                                909                 :     }
                                910              51 :     *to = 0;
                                911              51 :     return cnt;
                                912                 : }
                                913                 : 
                                914                 : static int
 8986 bruce                     915            2158 : pg_latin1_mblen(const unsigned char *s)
                                916                 : {
 6315                           917            2158 :     return 1;
                                918                 : }
                                919                 : 
                                920                 : static int
 6964 ishii                     921             400 : pg_latin1_dsplen(const unsigned char *s)
                                922                 : {
 6267 bruce                     923             400 :     return pg_ascii_dsplen(s);
                                924                 : }
                                925                 : 
                                926                 : /*
                                927                 :  * SJIS
                                928                 :  */
                                929                 : static int
 8986                           930             486 : pg_sjis_mblen(const unsigned char *s)
                                931                 : {
                                932                 :     int         len;
                                933                 : 
                                934             486 :     if (*s >= 0xa1 && *s <= 0xdf)
 6031 bruce                     935 UBC           0 :         len = 1;                /* 1 byte kana? */
 6313 bruce                     936 CBC         486 :     else if (IS_HIGHBIT_SET(*s))
 6031                           937             432 :         len = 2;                /* kanji? */
                                938                 :     else
                                939              54 :         len = 1;                /* should be ASCII */
 6315                           940             486 :     return len;
                                941                 : }
                                942                 : 
                                943                 : static int
 6964 ishii                     944 UBC           0 : pg_sjis_dsplen(const unsigned char *s)
                                945                 : {
                                946                 :     int         len;
                                947                 : 
                                948               0 :     if (*s >= 0xa1 && *s <= 0xdf)
 6031 bruce                     949               0 :         len = 1;                /* 1 byte kana? */
 6313                           950               0 :     else if (IS_HIGHBIT_SET(*s))
 6031                           951               0 :         len = 2;                /* kanji? */
                                952                 :     else
 2118 tgl                       953               0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
 6315 bruce                     954               0 :     return len;
                                955                 : }
                                956                 : 
                                957                 : /*
                                958                 :  * Big5
                                959                 :  */
                                960                 : static int
 8832 bruce                     961 CBC         234 : pg_big5_mblen(const unsigned char *s)
                                962                 : {
                                963                 :     int         len;
                                964                 : 
 6313                           965             234 :     if (IS_HIGHBIT_SET(*s))
 6031                           966             207 :         len = 2;                /* kanji? */
                                967                 :     else
                                968              27 :         len = 1;                /* should be ASCII */
 6315                           969             234 :     return len;
                                970                 : }
                                971                 : 
                                972                 : static int
 6964 ishii                     973 UBC           0 : pg_big5_dsplen(const unsigned char *s)
                                974                 : {
                                975                 :     int         len;
                                976                 : 
 6313 bruce                     977               0 :     if (IS_HIGHBIT_SET(*s))
 6031                           978               0 :         len = 2;                /* kanji? */
                                979                 :     else
 2118 tgl                       980               0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
 6315 bruce                     981               0 :     return len;
                                982                 : }
                                983                 : 
                                984                 : /*
                                985                 :  * GBK
                                986                 :  */
                                987                 : static int
 7705                           988               0 : pg_gbk_mblen(const unsigned char *s)
                                989                 : {
                                990                 :     int         len;
                                991                 : 
 6313                           992               0 :     if (IS_HIGHBIT_SET(*s))
 6031                           993               0 :         len = 2;                /* kanji? */
                                994                 :     else
                                995               0 :         len = 1;                /* should be ASCII */
 6315                           996               0 :     return len;
                                997                 : }
                                998                 : 
                                999                 : static int
 6964 ishii                    1000               0 : pg_gbk_dsplen(const unsigned char *s)
                               1001                 : {
                               1002                 :     int         len;
                               1003                 : 
 6313 bruce                    1004               0 :     if (IS_HIGHBIT_SET(*s))
 6031                          1005               0 :         len = 2;                /* kanji? */
                               1006                 :     else
 2118 tgl                      1007               0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
 6315 bruce                    1008               0 :     return len;
                               1009                 : }
                               1010                 : 
                               1011                 : /*
                               1012                 :  * UHC
                               1013                 :  */
                               1014                 : static int
 7705                          1015               0 : pg_uhc_mblen(const unsigned char *s)
                               1016                 : {
                               1017                 :     int         len;
                               1018                 : 
 6313                          1019               0 :     if (IS_HIGHBIT_SET(*s))
 6031                          1020               0 :         len = 2;                /* 2byte? */
                               1021                 :     else
                               1022               0 :         len = 1;                /* should be ASCII */
 6315                          1023               0 :     return len;
                               1024                 : }
                               1025                 : 
                               1026                 : static int
 6964 ishii                    1027               0 : pg_uhc_dsplen(const unsigned char *s)
                               1028                 : {
                               1029                 :     int         len;
                               1030                 : 
 6313 bruce                    1031               0 :     if (IS_HIGHBIT_SET(*s))
 6031                          1032               0 :         len = 2;                /* 2byte? */
                               1033                 :     else
 2118 tgl                      1034               0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
 6315 bruce                    1035               0 :     return len;
                               1036                 : }
                               1037                 : 
                               1038                 : /*
                               1039                 :  * GB18030
                               1040                 :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
                               1041                 :  */
                               1042                 : 
                               1043                 : /*
                               1044                 :  * Unlike all other mblen() functions, this also looks at the second byte of
                               1045                 :  * the input.  However, if you only pass the first byte of a multi-byte
                               1046                 :  * string, and \0 as the second byte, this still works in a predictable way:
                               1047                 :  * a 4-byte character will be reported as two 2-byte characters.  That's
                               1048                 :  * enough for all current uses, as a client-only encoding.  It works that
                               1049                 :  * way, because in any valid 4-byte GB18030-encoded character, the third and
                               1050                 :  * fourth byte look like a 2-byte encoded character, when looked at
                               1051                 :  * separately.
                               1052                 :  */
                               1053                 : static int
 7605 ishii                    1054 CBC          81 : pg_gb18030_mblen(const unsigned char *s)
                               1055                 : {
                               1056                 :     int         len;
                               1057                 : 
 6313 bruce                    1058              81 :     if (!IS_HIGHBIT_SET(*s))
 6031                          1059              18 :         len = 1;                /* ASCII */
 2886 tgl                      1060              63 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
                               1061              63 :         len = 4;
                               1062                 :     else
 2886 tgl                      1063 UBC           0 :         len = 2;
 6315 bruce                    1064 CBC          81 :     return len;
                               1065                 : }
                               1066                 : 
                               1067                 : static int
 6964 ishii                    1068 UBC           0 : pg_gb18030_dsplen(const unsigned char *s)
                               1069                 : {
                               1070                 :     int         len;
                               1071                 : 
 6267 bruce                    1072               0 :     if (IS_HIGHBIT_SET(*s))
 6964 ishii                    1073               0 :         len = 2;
                               1074                 :     else
 2118 tgl                      1075               0 :         len = pg_ascii_dsplen(s);   /* ASCII */
 6315 bruce                    1076               0 :     return len;
                               1077                 : }
                               1078                 : 
                               1079                 : /*
                               1080                 :  *-------------------------------------------------------------------
                               1081                 :  * multibyte sequence validators
                               1082                 :  *
                               1083                 :  * The verifychar functions accept "s", a pointer to the first byte of a
                               1084                 :  * string, and "len", the remaining length of the string.  If there is a
                               1085                 :  * validly encoded character beginning at *s, return its length in bytes;
                               1086                 :  * else return -1.
                               1087                 :  *
                               1088                 :  * The verifystr functions also accept "s", a pointer to a string and "len",
                               1089                 :  * the length of the string.  They verify the whole string, and return the
                               1090                 :  * number of input bytes (<= len) that are valid.  In other words, if the
                               1091                 :  * whole string is valid, verifystr returns "len", otherwise it returns the
                               1092                 :  * byte offset of the first invalid character.  The verifystr functions must
                               1093                 :  * test for and reject zeroes in the input.
                               1094                 :  *
                               1095                 :  * The verifychar functions can assume that len > 0 and that *s != '\0', but
                               1096                 :  * they must test for and reject zeroes in any additional bytes of a
                               1097                 :  * multibyte character.  Note that this definition allows the function for a
                               1098                 :  * single-byte encoding to be just "return 1".
                               1099                 :  *-------------------------------------------------------------------
                               1100                 :  */
                               1101                 : static int
  801 heikki.linnakangas       1102 CBC          28 : pg_ascii_verifychar(const unsigned char *s, int len)
                               1103                 : {
 6167 tgl                      1104              28 :     return 1;
                               1105                 : }
                               1106                 : 
                               1107                 : static int
  801 heikki.linnakangas       1108            2281 : pg_ascii_verifystr(const unsigned char *s, int len)
                               1109                 : {
                               1110            2281 :     const unsigned char *nullpos = memchr(s, 0, len);
                               1111                 : 
                               1112            2281 :     if (nullpos == NULL)
                               1113            2281 :         return len;
                               1114                 :     else
  801 heikki.linnakangas       1115 UBC           0 :         return nullpos - s;
                               1116                 : }
                               1117                 : 
                               1118                 : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
                               1119                 : 
                               1120                 : static int
  801 heikki.linnakangas       1121 CBC         216 : pg_eucjp_verifychar(const unsigned char *s, int len)
                               1122                 : {
                               1123                 :     int         l;
                               1124                 :     unsigned char c1,
                               1125                 :                 c2;
                               1126                 : 
 6167 tgl                      1127             216 :     c1 = *s++;
                               1128                 : 
                               1129             216 :     switch (c1)
                               1130                 :     {
 6031 bruce                    1131 UBC           0 :         case SS2:               /* JIS X 0201 */
 6167 tgl                      1132               0 :             l = 2;
                               1133               0 :             if (l > len)
                               1134               0 :                 return -1;
                               1135               0 :             c2 = *s++;
                               1136               0 :             if (c2 < 0xa1 || c2 > 0xdf)
                               1137               0 :                 return -1;
                               1138               0 :             break;
                               1139                 : 
 6031 bruce                    1140               0 :         case SS3:               /* JIS X 0212 */
 6167 tgl                      1141               0 :             l = 3;
                               1142               0 :             if (l > len)
                               1143               0 :                 return -1;
                               1144               0 :             c2 = *s++;
                               1145               0 :             if (!IS_EUC_RANGE_VALID(c2))
                               1146               0 :                 return -1;
                               1147               0 :             c2 = *s++;
                               1148               0 :             if (!IS_EUC_RANGE_VALID(c2))
                               1149               0 :                 return -1;
                               1150               0 :             break;
                               1151                 : 
 6167 tgl                      1152 CBC         216 :         default:
 2118                          1153             216 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
                               1154                 :             {
 6167                          1155             216 :                 l = 2;
                               1156             216 :                 if (l > len)
                               1157              36 :                     return -1;
                               1158             180 :                 if (!IS_EUC_RANGE_VALID(c1))
 6167 tgl                      1159 UBC           0 :                     return -1;
 6167 tgl                      1160 CBC         180 :                 c2 = *s++;
                               1161             180 :                 if (!IS_EUC_RANGE_VALID(c2))
                               1162              72 :                     return -1;
                               1163                 :             }
                               1164                 :             else
                               1165                 :                 /* must be ASCII */
                               1166                 :             {
 6167 tgl                      1167 UBC           0 :                 l = 1;
                               1168                 :             }
 6167 tgl                      1169 CBC         108 :             break;
                               1170                 :     }
                               1171                 : 
                               1172             108 :     return l;
                               1173                 : }
                               1174                 : 
                               1175                 : static int
  801 heikki.linnakangas       1176             132 : pg_eucjp_verifystr(const unsigned char *s, int len)
                               1177                 : {
                               1178             132 :     const unsigned char *start = s;
                               1179                 : 
                               1180             447 :     while (len > 0)
                               1181                 :     {
                               1182                 :         int         l;
                               1183                 : 
                               1184                 :         /* fast path for ASCII-subset characters */
                               1185             405 :         if (!IS_HIGHBIT_SET(*s))
                               1186                 :         {
                               1187             297 :             if (*s == '\0')
                               1188              36 :                 break;
                               1189             261 :             l = 1;
                               1190                 :         }
                               1191                 :         else
                               1192                 :         {
                               1193             108 :             l = pg_eucjp_verifychar(s, len);
                               1194             108 :             if (l == -1)
                               1195              54 :                 break;
                               1196                 :         }
                               1197             315 :         s += l;
                               1198             315 :         len -= l;
                               1199                 :     }
                               1200                 : 
                               1201             132 :     return s - start;
                               1202                 : }
                               1203                 : 
                               1204                 : static int
  801 heikki.linnakangas       1205 UBC           0 : pg_euckr_verifychar(const unsigned char *s, int len)
                               1206                 : {
                               1207                 :     int         l;
                               1208                 :     unsigned char c1,
                               1209                 :                 c2;
                               1210                 : 
 6167 tgl                      1211               0 :     c1 = *s++;
                               1212                 : 
                               1213               0 :     if (IS_HIGHBIT_SET(c1))
                               1214                 :     {
                               1215               0 :         l = 2;
                               1216               0 :         if (l > len)
                               1217               0 :             return -1;
                               1218               0 :         if (!IS_EUC_RANGE_VALID(c1))
                               1219               0 :             return -1;
                               1220               0 :         c2 = *s++;
                               1221               0 :         if (!IS_EUC_RANGE_VALID(c2))
                               1222               0 :             return -1;
                               1223                 :     }
                               1224                 :     else
                               1225                 :         /* must be ASCII */
                               1226                 :     {
                               1227               0 :         l = 1;
                               1228                 :     }
                               1229                 : 
                               1230               0 :     return l;
                               1231                 : }
                               1232                 : 
                               1233                 : static int
  801 heikki.linnakangas       1234 CBC          12 : pg_euckr_verifystr(const unsigned char *s, int len)
                               1235                 : {
                               1236              12 :     const unsigned char *start = s;
                               1237                 : 
                               1238              48 :     while (len > 0)
                               1239                 :     {
                               1240                 :         int         l;
                               1241                 : 
                               1242                 :         /* fast path for ASCII-subset characters */
                               1243              36 :         if (!IS_HIGHBIT_SET(*s))
                               1244                 :         {
                               1245              36 :             if (*s == '\0')
  801 heikki.linnakangas       1246 UBC           0 :                 break;
  801 heikki.linnakangas       1247 CBC          36 :             l = 1;
                               1248                 :         }
                               1249                 :         else
                               1250                 :         {
  801 heikki.linnakangas       1251 UBC           0 :             l = pg_euckr_verifychar(s, len);
                               1252               0 :             if (l == -1)
                               1253               0 :                 break;
                               1254                 :         }
  801 heikki.linnakangas       1255 CBC          36 :         s += l;
                               1256              36 :         len -= l;
                               1257                 :     }
                               1258                 : 
                               1259              12 :     return s - start;
                               1260                 : }
                               1261                 : 
                               1262                 : /* EUC-CN byte sequences are exactly same as EUC-KR */
                               1263                 : #define pg_euccn_verifychar pg_euckr_verifychar
                               1264                 : #define pg_euccn_verifystr  pg_euckr_verifystr
                               1265                 : 
                               1266                 : static int
  801 heikki.linnakangas       1267 UBC           0 : pg_euctw_verifychar(const unsigned char *s, int len)
                               1268                 : {
                               1269                 :     int         l;
                               1270                 :     unsigned char c1,
                               1271                 :                 c2;
                               1272                 : 
 6167 tgl                      1273               0 :     c1 = *s++;
                               1274                 : 
                               1275               0 :     switch (c1)
                               1276                 :     {
 6031 bruce                    1277               0 :         case SS2:               /* CNS 11643 Plane 1-7 */
 6167 tgl                      1278               0 :             l = 4;
                               1279               0 :             if (l > len)
                               1280               0 :                 return -1;
                               1281               0 :             c2 = *s++;
                               1282               0 :             if (c2 < 0xa1 || c2 > 0xa7)
                               1283               0 :                 return -1;
                               1284               0 :             c2 = *s++;
                               1285               0 :             if (!IS_EUC_RANGE_VALID(c2))
                               1286               0 :                 return -1;
                               1287               0 :             c2 = *s++;
                               1288               0 :             if (!IS_EUC_RANGE_VALID(c2))
                               1289               0 :                 return -1;
                               1290               0 :             break;
                               1291                 : 
 6031 bruce                    1292               0 :         case SS3:               /* unused */
 6167 tgl                      1293               0 :             return -1;
                               1294                 : 
                               1295               0 :         default:
 2118                          1296               0 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
                               1297                 :             {
 6167                          1298               0 :                 l = 2;
                               1299               0 :                 if (l > len)
                               1300               0 :                     return -1;
                               1301                 :                 /* no further range check on c1? */
                               1302               0 :                 c2 = *s++;
                               1303               0 :                 if (!IS_EUC_RANGE_VALID(c2))
                               1304               0 :                     return -1;
                               1305                 :             }
                               1306                 :             else
                               1307                 :                 /* must be ASCII */
                               1308                 :             {
                               1309               0 :                 l = 1;
                               1310                 :             }
                               1311               0 :             break;
                               1312                 :     }
                               1313               0 :     return l;
                               1314                 : }
                               1315                 : 
                               1316                 : static int
  801 heikki.linnakangas       1317 CBC           9 : pg_euctw_verifystr(const unsigned char *s, int len)
                               1318                 : {
                               1319               9 :     const unsigned char *start = s;
                               1320                 : 
                               1321              36 :     while (len > 0)
                               1322                 :     {
                               1323                 :         int         l;
                               1324                 : 
                               1325                 :         /* fast path for ASCII-subset characters */
                               1326              27 :         if (!IS_HIGHBIT_SET(*s))
                               1327                 :         {
                               1328              27 :             if (*s == '\0')
  801 heikki.linnakangas       1329 UBC           0 :                 break;
  801 heikki.linnakangas       1330 CBC          27 :             l = 1;
                               1331                 :         }
                               1332                 :         else
                               1333                 :         {
  801 heikki.linnakangas       1334 UBC           0 :             l = pg_euctw_verifychar(s, len);
                               1335               0 :             if (l == -1)
                               1336               0 :                 break;
                               1337                 :         }
  801 heikki.linnakangas       1338 CBC          27 :         s += l;
                               1339              27 :         len -= l;
                               1340                 :     }
                               1341                 : 
                               1342               9 :     return s - start;
                               1343                 : }
                               1344                 : 
                               1345                 : static int
  801 heikki.linnakangas       1346 UBC           0 : pg_johab_verifychar(const unsigned char *s, int len)
                               1347                 : {
                               1348                 :     int         l,
                               1349                 :                 mbl;
                               1350                 :     unsigned char c;
                               1351                 : 
 6167 tgl                      1352               0 :     l = mbl = pg_johab_mblen(s);
                               1353                 : 
                               1354               0 :     if (len < l)
                               1355               0 :         return -1;
                               1356                 : 
                               1357               0 :     if (!IS_HIGHBIT_SET(*s))
                               1358               0 :         return mbl;
                               1359                 : 
                               1360               0 :     while (--l > 0)
                               1361                 :     {
                               1362               0 :         c = *++s;
                               1363               0 :         if (!IS_EUC_RANGE_VALID(c))
                               1364               0 :             return -1;
                               1365                 :     }
                               1366               0 :     return mbl;
                               1367                 : }
                               1368                 : 
                               1369                 : static int
  801 heikki.linnakangas       1370 CBC           3 : pg_johab_verifystr(const unsigned char *s, int len)
                               1371                 : {
                               1372               3 :     const unsigned char *start = s;
                               1373                 : 
                               1374              12 :     while (len > 0)
                               1375                 :     {
                               1376                 :         int         l;
                               1377                 : 
                               1378                 :         /* fast path for ASCII-subset characters */
                               1379               9 :         if (!IS_HIGHBIT_SET(*s))
                               1380                 :         {
                               1381               9 :             if (*s == '\0')
  801 heikki.linnakangas       1382 UBC           0 :                 break;
  801 heikki.linnakangas       1383 CBC           9 :             l = 1;
                               1384                 :         }
                               1385                 :         else
                               1386                 :         {
  801 heikki.linnakangas       1387 UBC           0 :             l = pg_johab_verifychar(s, len);
                               1388               0 :             if (l == -1)
                               1389               0 :                 break;
                               1390                 :         }
  801 heikki.linnakangas       1391 CBC           9 :         s += l;
                               1392               9 :         len -= l;
                               1393                 :     }
                               1394                 : 
                               1395               3 :     return s - start;
                               1396                 : }
                               1397                 : 
                               1398                 : static int
                               1399             648 : pg_mule_verifychar(const unsigned char *s, int len)
                               1400                 : {
                               1401                 :     int         l,
                               1402                 :                 mbl;
                               1403                 :     unsigned char c;
                               1404                 : 
 6167 tgl                      1405             648 :     l = mbl = pg_mule_mblen(s);
                               1406                 : 
                               1407             648 :     if (len < l)
                               1408             162 :         return -1;
                               1409                 : 
                               1410             999 :     while (--l > 0)
                               1411                 :     {
                               1412             657 :         c = *++s;
                               1413             657 :         if (!IS_HIGHBIT_SET(c))
                               1414             144 :             return -1;
                               1415                 :     }
                               1416             342 :     return mbl;
                               1417                 : }
                               1418                 : 
                               1419                 : static int
  801 heikki.linnakangas       1420             189 : pg_mule_verifystr(const unsigned char *s, int len)
                               1421                 : {
                               1422             189 :     const unsigned char *start = s;
                               1423                 : 
                               1424             531 :     while (len > 0)
                               1425                 :     {
                               1426                 :         int         l;
                               1427                 : 
                               1428                 :         /* fast path for ASCII-subset characters */
                               1429             450 :         if (!IS_HIGHBIT_SET(*s))
                               1430                 :         {
                               1431             261 :             if (*s == '\0')
                               1432              18 :                 break;
                               1433             243 :             l = 1;
                               1434                 :         }
                               1435                 :         else
                               1436                 :         {
                               1437             189 :             l = pg_mule_verifychar(s, len);
                               1438             189 :             if (l == -1)
                               1439              90 :                 break;
                               1440                 :         }
                               1441             342 :         s += l;
                               1442             342 :         len -= l;
                               1443                 :     }
                               1444                 : 
                               1445             189 :     return s - start;
                               1446                 : }
                               1447                 : 
                               1448                 : static int
                               1449             167 : pg_latin1_verifychar(const unsigned char *s, int len)
                               1450                 : {
 6167 tgl                      1451             167 :     return 1;
                               1452                 : }
                               1453                 : 
                               1454                 : static int
  801 heikki.linnakangas       1455            5416 : pg_latin1_verifystr(const unsigned char *s, int len)
                               1456                 : {
                               1457            5416 :     const unsigned char *nullpos = memchr(s, 0, len);
                               1458                 : 
                               1459            5416 :     if (nullpos == NULL)
                               1460            5362 :         return len;
                               1461                 :     else
                               1462              54 :         return nullpos - s;
                               1463                 : }
                               1464                 : 
                               1465                 : static int
                               1466             351 : pg_sjis_verifychar(const unsigned char *s, int len)
                               1467                 : {
                               1468                 :     int         l,
                               1469                 :                 mbl;
                               1470                 :     unsigned char c1,
                               1471                 :                 c2;
                               1472                 : 
 6167 tgl                      1473             351 :     l = mbl = pg_sjis_mblen(s);
                               1474                 : 
                               1475             351 :     if (len < l)
                               1476              54 :         return -1;
                               1477                 : 
                               1478             297 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
 6167 tgl                      1479 UBC           0 :         return mbl;
                               1480                 : 
 6167 tgl                      1481 CBC         297 :     c1 = *s++;
                               1482             297 :     c2 = *s;
                               1483             297 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
                               1484             108 :         return -1;
                               1485             189 :     return mbl;
                               1486                 : }
                               1487                 : 
                               1488                 : static int
  801 heikki.linnakangas       1489             141 : pg_sjis_verifystr(const unsigned char *s, int len)
                               1490                 : {
                               1491             141 :     const unsigned char *start = s;
                               1492                 : 
                               1493             627 :     while (len > 0)
                               1494                 :     {
                               1495                 :         int         l;
                               1496                 : 
                               1497                 :         /* fast path for ASCII-subset characters */
                               1498             576 :         if (!IS_HIGHBIT_SET(*s))
                               1499                 :         {
                               1500             459 :             if (*s == '\0')
                               1501              36 :                 break;
                               1502             423 :             l = 1;
                               1503                 :         }
                               1504                 :         else
                               1505                 :         {
                               1506             117 :             l = pg_sjis_verifychar(s, len);
                               1507             117 :             if (l == -1)
                               1508              54 :                 break;
                               1509                 :         }
                               1510             486 :         s += l;
                               1511             486 :         len -= l;
                               1512                 :     }
                               1513                 : 
                               1514             141 :     return s - start;
                               1515                 : }
                               1516                 : 
                               1517                 : static int
                               1518             171 : pg_big5_verifychar(const unsigned char *s, int len)
                               1519                 : {
                               1520                 :     int         l,
                               1521                 :                 mbl;
                               1522                 : 
 6167 tgl                      1523             171 :     l = mbl = pg_big5_mblen(s);
                               1524                 : 
                               1525             171 :     if (len < l)
 6167 tgl                      1526 UBC           0 :         return -1;
                               1527                 : 
 6167 tgl                      1528 CBC         288 :     while (--l > 0)
                               1529                 :     {
                               1530             171 :         if (*++s == '\0')
                               1531              54 :             return -1;
                               1532                 :     }
                               1533                 : 
                               1534             117 :     return mbl;
                               1535                 : }
                               1536                 : 
                               1537                 : static int
  801 heikki.linnakangas       1538              72 : pg_big5_verifystr(const unsigned char *s, int len)
                               1539                 : {
                               1540              72 :     const unsigned char *start = s;
                               1541                 : 
                               1542             324 :     while (len > 0)
                               1543                 :     {
                               1544                 :         int         l;
                               1545                 : 
                               1546                 :         /* fast path for ASCII-subset characters */
                               1547             288 :         if (!IS_HIGHBIT_SET(*s))
                               1548                 :         {
                               1549             234 :             if (*s == '\0')
                               1550              18 :                 break;
                               1551             216 :             l = 1;
                               1552                 :         }
                               1553                 :         else
                               1554                 :         {
                               1555              54 :             l = pg_big5_verifychar(s, len);
                               1556              54 :             if (l == -1)
                               1557              18 :                 break;
                               1558                 :         }
                               1559             252 :         s += l;
                               1560             252 :         len -= l;
                               1561                 :     }
                               1562                 : 
                               1563              72 :     return s - start;
                               1564                 : }
                               1565                 : 
                               1566                 : static int
  801 heikki.linnakangas       1567 UBC           0 : pg_gbk_verifychar(const unsigned char *s, int len)
                               1568                 : {
                               1569                 :     int         l,
                               1570                 :                 mbl;
                               1571                 : 
 6167 tgl                      1572               0 :     l = mbl = pg_gbk_mblen(s);
                               1573                 : 
                               1574               0 :     if (len < l)
                               1575               0 :         return -1;
                               1576                 : 
                               1577               0 :     while (--l > 0)
                               1578                 :     {
                               1579               0 :         if (*++s == '\0')
                               1580               0 :             return -1;
                               1581                 :     }
                               1582                 : 
                               1583               0 :     return mbl;
                               1584                 : }
                               1585                 : 
                               1586                 : static int
  801 heikki.linnakangas       1587 CBC           3 : pg_gbk_verifystr(const unsigned char *s, int len)
                               1588                 : {
                               1589               3 :     const unsigned char *start = s;
                               1590                 : 
                               1591              12 :     while (len > 0)
                               1592                 :     {
                               1593                 :         int         l;
                               1594                 : 
                               1595                 :         /* fast path for ASCII-subset characters */
                               1596               9 :         if (!IS_HIGHBIT_SET(*s))
                               1597                 :         {
                               1598               9 :             if (*s == '\0')
  801 heikki.linnakangas       1599 UBC           0 :                 break;
  801 heikki.linnakangas       1600 CBC           9 :             l = 1;
                               1601                 :         }
                               1602                 :         else
                               1603                 :         {
  801 heikki.linnakangas       1604 UBC           0 :             l = pg_gbk_verifychar(s, len);
                               1605               0 :             if (l == -1)
                               1606               0 :                 break;
                               1607                 :         }
  801 heikki.linnakangas       1608 CBC           9 :         s += l;
                               1609               9 :         len -= l;
                               1610                 :     }
                               1611                 : 
                               1612               3 :     return s - start;
                               1613                 : }
                               1614                 : 
                               1615                 : static int
  801 heikki.linnakangas       1616 UBC           0 : pg_uhc_verifychar(const unsigned char *s, int len)
                               1617                 : {
                               1618                 :     int         l,
                               1619                 :                 mbl;
                               1620                 : 
 6167 tgl                      1621               0 :     l = mbl = pg_uhc_mblen(s);
                               1622                 : 
                               1623               0 :     if (len < l)
                               1624               0 :         return -1;
                               1625                 : 
                               1626               0 :     while (--l > 0)
                               1627                 :     {
                               1628               0 :         if (*++s == '\0')
                               1629               0 :             return -1;
                               1630                 :     }
                               1631                 : 
                               1632               0 :     return mbl;
                               1633                 : }
                               1634                 : 
                               1635                 : static int
  801 heikki.linnakangas       1636 CBC           3 : pg_uhc_verifystr(const unsigned char *s, int len)
                               1637                 : {
                               1638               3 :     const unsigned char *start = s;
                               1639                 : 
                               1640              12 :     while (len > 0)
                               1641                 :     {
                               1642                 :         int         l;
                               1643                 : 
                               1644                 :         /* fast path for ASCII-subset characters */
                               1645               9 :         if (!IS_HIGHBIT_SET(*s))
                               1646                 :         {
                               1647               9 :             if (*s == '\0')
  801 heikki.linnakangas       1648 UBC           0 :                 break;
  801 heikki.linnakangas       1649 CBC           9 :             l = 1;
                               1650                 :         }
                               1651                 :         else
                               1652                 :         {
  801 heikki.linnakangas       1653 UBC           0 :             l = pg_uhc_verifychar(s, len);
                               1654               0 :             if (l == -1)
                               1655               0 :                 break;
                               1656                 :         }
  801 heikki.linnakangas       1657 CBC           9 :         s += l;
                               1658               9 :         len -= l;
                               1659                 :     }
                               1660                 : 
                               1661               3 :     return s - start;
                               1662                 : }
                               1663                 : 
                               1664                 : static int
                               1665             207 : pg_gb18030_verifychar(const unsigned char *s, int len)
                               1666                 : {
                               1667                 :     int         l;
                               1668                 : 
 2886 tgl                      1669             207 :     if (!IS_HIGHBIT_SET(*s))
 2886 tgl                      1670 UBC           0 :         l = 1;                  /* ASCII */
 2886 tgl                      1671 CBC         207 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
                               1672                 :     {
                               1673                 :         /* Should be 4-byte, validate remaining bytes */
                               1674             153 :         if (*s >= 0x81 && *s <= 0xfe &&
                               1675             153 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
                               1676             153 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
                               1677              81 :             l = 4;
                               1678                 :         else
                               1679              72 :             l = -1;
                               1680                 :     }
                               1681              54 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
                               1682                 :     {
                               1683                 :         /* Should be 2-byte, validate */
                               1684              54 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
                               1685              54 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
                               1686              18 :             l = 2;
                               1687                 :         else
                               1688              36 :             l = -1;
                               1689                 :     }
                               1690                 :     else
 2886 tgl                      1691 UBC           0 :         l = -1;
 2886 tgl                      1692 CBC         207 :     return l;
                               1693                 : }
                               1694                 : 
                                                      /*
                                                       * Verify a GB18030 byte string of at most len bytes.
                                                       *
                                                       * Returns the number of leading bytes that were accepted.  Scanning stops
                                                       * at the first NUL byte or at the first character for which
                                                       * pg_gb18030_verifychar() returns -1, so a result smaller than len means
                                                       * the scan stopped early at that offset.
                                                       */
                               1695                 : static int
  801 heikki.linnakangas       1696             111 : pg_gb18030_verifystr(const unsigned char *s, int len)
                               1697                 : {
                               1698             111 :     const unsigned char *start = s;
                               1699                 : 
                               1700             489 :     while (len > 0)
                               1701                 :     {
                               1702                 :         int         l;
                               1703                 : 
                               1704                 :         /* fast path for ASCII-subset characters */
                               1705             450 :         if (!IS_HIGHBIT_SET(*s))
                               1706                 :         {
                                                                  /* an embedded NUL terminates the scan */
                               1707             351 :             if (*s == '\0')
                               1708              18 :                 break;
                               1709             333 :             l = 1;
                               1710                 :         }
                               1711                 :         else
                               1712                 :         {
                                                                  /* high-bit byte: validate the whole multibyte character */
                               1713              99 :             l = pg_gb18030_verifychar(s, len);
                               1714              99 :             if (l == -1)
                               1715              54 :                 break;
                               1716                 :         }
                               1717             378 :         s += l;
                               1718             378 :         len -= l;
                               1719                 :     }
                               1720                 : 
                                                          /* number of bytes consumed before stopping */
                               1721             111 :     return s - start;
                               1722                 : }
                               1723                 : 
                                                      /*
                                                       * Verify a single UTF-8 character starting at s, with len bytes available.
                                                       *
                                                       * Returns the character's length in bytes (1..4) if it is valid.  Returns
                                                       * -1 if the first byte is NUL, if the length implied by the lead byte
                                                       * exceeds len, or if pg_utf8_islegal() rejects the sequence.
                                                       */
                               1724                 : static int
                               1725            5178 : pg_utf8_verifychar(const unsigned char *s, int len)
                               1726                 : {
                               1727                 :     int         l;
                               1728                 : 
                               1729            5178 :     if ((*s & 0x80) == 0)
                               1730                 :     {
                                                              /* single-byte (ASCII) character; NUL is not accepted */
  801 heikki.linnakangas       1731 UBC           0 :         if (*s == '\0')
                               1732               0 :             return -1;
                               1733               0 :         return 1;
                               1734                 :     }
  801 heikki.linnakangas       1735 CBC        5178 :     else if ((*s & 0xe0) == 0xc0)
                               1736            1918 :         l = 2;
                               1737            3260 :     else if ((*s & 0xf0) == 0xe0)
                               1738            2742 :         l = 3;
                               1739             518 :     else if ((*s & 0xf8) == 0xf0)
                               1740             386 :         l = 4;
                               1741                 :     else
                                                              /*
                                                               * Not a recognized lead byte (80..BF or F8..FF).  Use length 1 so
                                                               * that the length-1 checks in pg_utf8_islegal() reject it below.
                                                               */
                               1742             132 :         l = 1;
                               1743                 : 
                                                          /* the sequence must fit within the bytes the caller made available */
                               1744            5178 :     if (l > len)
 6167 tgl                      1745              90 :         return -1;
                               1746                 : 
                               1747            5088 :     if (!pg_utf8_islegal(s, l))
                               1748             906 :         return -1;
                               1749                 : 
                               1750            4182 :     return l;
                               1751                 : }
                               1752                 : 
                               1753                 : /*
                               1754                 :  * The fast path of the UTF-8 verifier uses a deterministic finite automaton
                               1755                 :  * (DFA) for multibyte characters. In a traditional table-driven DFA, the
                               1756                 :  * input byte and current state are used to compute an index into an array of
                               1757                 :  * state transitions. Since the address of the next transition is dependent
                               1758                 :  * on this computation, there is latency in executing the load instruction,
                               1759                 :  * and the CPU is not kept busy.
                               1760                 :  *
                               1761                 :  * Instead, we use a "shift-based" DFA as described by Per Vognsen:
                               1762                 :  *
                               1763                 :  * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
                               1764                 :  *
                               1765                 :  * In a shift-based DFA, the input byte is an index into an array of integers
                               1766                 :  * whose bit pattern encodes the state transitions. To compute the next
                               1767                 :  * state, we simply right-shift the integer by the current state and apply a
                               1768                 :  * mask. In this scheme, the address of the transition only depends on the
                               1769                 :  * input byte, so there is better pipelining.
                               1770                 :  *
                               1771                 :  * The naming convention for states and transitions was adopted from a UTF-8
                               1772                 :  * to UTF-16/32 transcoder, whose table is reproduced below:
                               1773                 :  *
                               1774                 :  * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
                               1775                 :  *
                               1776                 :  * ILL  ASC  CR1  CR2  CR3  L2A  L3A  L3B  L3C  L4A  L4B  L4C CLASS / STATE
                               1777                 :  * ==========================================================================
                               1778                 :  * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B,      | BGN/END
                               1779                 :  * err, err, err, err, err, err, err, err, err, err, err, err,      | ERR
                               1780                 :  *                                                                  |
                               1781                 :  * err, err, END, END, END, err, err, err, err, err, err, err,      | CS1
                               1782                 :  * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err,      | CS2
                               1783                 :  * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err,      | CS3
                               1784                 :  *                                                                  |
                               1785                 :  * err, err, err, err, CS1, err, err, err, err, err, err, err,      | P3A
                               1786                 :  * err, err, CS1, CS1, err, err, err, err, err, err, err, err,      | P3B
                               1787                 :  *                                                                  |
                               1788                 :  * err, err, err, CS2, CS2, err, err, err, err, err, err, err,      | P4A
                               1789                 :  * err, err, CS2, err, err, err, err, err, err, err, err, err,      | P4B
                               1790                 :  *
                               1791                 :  * In the most straightforward implementation, a shift-based DFA for UTF-8
                               1792                 :  * requires 64-bit integers to encode the transitions, but with an SMT solver
                               1793                 :  * it's possible to find state numbers such that the transitions fit within
                               1794                 :  * 32-bit integers, as Dougall Johnson demonstrated:
                               1795                 :  *
                               1796                 :  * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
                               1797                 :  *
                               1798                 :  * This packed representation is the reason for the seemingly odd choice of
                               1799                 :  * state values below.
                               1800                 :  */
                               1801                 : 
                               1802                 : /* Error */
                               1803                 : #define ERR  0
                               1804                 : /* Begin */
                               1805                 : #define BGN 11
                               1806                 : /* Continuation states, expect 1/2/3 continuation bytes */
                               1807                 : #define CS1 16
                               1808                 : #define CS2  1
                               1809                 : #define CS3  5
                               1810                 : /* Partial states, where the first continuation byte has a restricted range */
                               1811                 : #define P3A  6                  /* Lead was E0, check for 3-byte overlong */
                               1812                 : #define P3B 20                  /* Lead was ED, check for surrogate */
                               1813                 : #define P4A 25                  /* Lead was F0, check for 4-byte overlong */
                               1814                 : #define P4B 30                  /* Lead was F4, check for too-large */
                               1815                 : /* Begin and End are the same state */
                               1816                 : #define END BGN
                               1817                 : 
                               1818                 : /* the encoded state transitions for the lookup table */
                                                      /*
                                                       * Each macro below is the 32-bit word stored in the table for one class
                                                       * of input byte.  A term (NEXT << CUR) places state NEXT at bit offset
                                                       * CUR, i.e. it encodes the transition taken when the DFA is in state CUR.
                                                       * Any offset that no term covers reads back as ERR (0).
                                                       */
                               1819                 : 
                               1820                 : /* ASCII */
                               1821                 : #define ASC (END << BGN)
                               1822                 : /* 2-byte lead */
                               1823                 : #define L2A (CS1 << BGN)
                               1824                 : /* 3-byte lead */
                               1825                 : #define L3A (P3A << BGN)
                               1826                 : #define L3B (CS2 << BGN)
                               1827                 : #define L3C (P3B << BGN)
                               1828                 : /* 4-byte lead */
                               1829                 : #define L4A (P4A << BGN)
                               1830                 : #define L4B (CS3 << BGN)
                               1831                 : #define L4C (P4B << BGN)
                               1832                 : /* continuation byte */
                                                      /*
                                                       * NOTE: the CRn expansions are not wrapped in outer parentheses, so they
                                                       * are only safe where they form a complete expression by themselves, as
                                                       * in the array initializers of Utf8Transition.
                                                       */
                               1833                 : #define CR1 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B)
                               1834                 : #define CR2 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A)
                               1835                 : #define CR3 (END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A)
                               1836                 : /* invalid byte */
                               1837                 : #define ILL ERR
                               1838                 : 
                                                      /*
                                                       * Transition table for the shift-based UTF-8 DFA, indexed by input byte.
                                                       * Each entry is one of the byte-class words defined above (ASC, CRn, LnX
                                                       * or ILL).  The next state is obtained by shifting the entry right by the
                                                       * current state and keeping the low five bits, as done in utf8_advance().
                                                       */
                               1839                 : static const uint32 Utf8Transition[256] =
                               1840                 : {
                               1841                 :     /* ASCII (00..7F); byte 00 is ILL, so a NUL byte drives the DFA to ERR */
                               1842                 : 
                               1843                 :     ILL, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1844                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1845                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1846                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1847                 : 
                               1848                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1849                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1850                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1851                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1852                 : 
                               1853                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1854                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1855                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1856                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1857                 : 
                               1858                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1859                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1860                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1861                 :     ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,
                               1862                 : 
                               1863                 :     /* continuation bytes */
                               1864                 : 
                               1865                 :     /* 80..8F */
                               1866                 :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
                               1867                 :     CR1, CR1, CR1, CR1, CR1, CR1, CR1, CR1,
                               1868                 : 
                               1869                 :     /* 90..9F */
                               1870                 :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
                               1871                 :     CR2, CR2, CR2, CR2, CR2, CR2, CR2, CR2,
                               1872                 : 
                               1873                 :     /* A0..BF */
                               1874                 :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
                               1875                 :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
                               1876                 :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
                               1877                 :     CR3, CR3, CR3, CR3, CR3, CR3, CR3, CR3,
                               1878                 : 
                               1879                 :     /* leading bytes */
                               1880                 : 
                               1881                 :     /* C0..DF; C0 and C1 are ILL (pg_utf8_islegal likewise rejects leads below C2) */
                               1882                 :     ILL, ILL, L2A, L2A, L2A, L2A, L2A, L2A,
                               1883                 :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
                               1884                 :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
                               1885                 :     L2A, L2A, L2A, L2A, L2A, L2A, L2A, L2A,
                               1886                 : 
                               1887                 :     /* E0..EF; E0 is L3A and ED is L3C, all other leads are L3B */
                               1888                 :     L3A, L3B, L3B, L3B, L3B, L3B, L3B, L3B,
                               1889                 :     L3B, L3B, L3B, L3B, L3B, L3C, L3B, L3B,
                               1890                 : 
                               1891                 :     /* F0..FF; F0 is L4A, F1..F3 are L4B, F4 is L4C, F5 and above are ILL */
                               1892                 :     L4A, L4B, L4B, L4B, L4C, ILL, ILL, ILL,
                               1893                 :     ILL, ILL, ILL, ILL, ILL, ILL, ILL, ILL
                               1894                 : };
                               1895                 : 
                                                      /*
                                                       * Run the UTF-8 DFA over len bytes starting at s.
                                                       *
                                                       * *state is the DFA state on entry and is overwritten with the state
                                                       * reached after all len bytes have been consumed, reduced to its low five
                                                       * bits.  The loop never exits early; the caller is expected to inspect
                                                       * *state (e.g. compare it with ERR or END) afterwards.
                                                       */
                               1896                 : static void
  537 john.naylor              1897             751 : utf8_advance(const unsigned char *s, uint32 *state, int len)
                               1898                 : {
                               1899                 :     /* Note: We deliberately don't check the state's value here. */
                               1900           24783 :     while (len > 0)
                               1901                 :     {
                               1902                 :         /*
                               1903                 :          * It's important that the mask value is 31: In most instruction sets,
                               1904                 :          * a shift by a 32-bit operand is understood to be a shift by its mod
                               1905                 :          * 32, so the compiler should elide the mask operation.
                               1906                 :          */
                                                              /*
                                                               * Only the low five bits of *state are meaningful between
                                                               * iterations; the bits above them are leftovers of the shifted
                                                               * table word and are masked off on the next use.
                                                               */
                               1907           24032 :         *state = Utf8Transition[*s++] >> (*state & 31);
                               1908           24032 :         len--;
                               1909                 :     }
                               1910                 : 
                                                          /* strip the leftover high bits so callers can compare against state names */
                               1911             751 :     *state &= 31;
                               1912             751 : }
                               1913                 : 
                                                      /*
                                                       * Verify a UTF-8 byte string of at most len bytes.
                                                       *
                                                       * Returns the number of leading bytes accepted; the byte-wise loop at the
                                                       * end stops at the first NUL byte or at the first character for which
                                                       * pg_utf8_verifychar() returns -1.
                                                       *
                                                       * Inputs of at least STRIDE_LENGTH bytes first go through a fast path that
                                                       * consumes STRIDE_LENGTH bytes at a time: a chunk is skipped outright when
                                                       * the DFA is in END and is_valid_ascii() accepts it, otherwise it is fed
                                                       * through utf8_advance().  Whatever the fast path leaves over (or the whole
                                                       * input, if it ended in ERR) is then checked one character at a time.
                                                       *
                                                       * NOTE(review): is_valid_ascii() is not visible here; presumably it returns
                                                       * false for chunks containing a NUL byte, so that such chunks reach the DFA
                                                       * (which maps byte 00 to ILL) -- confirm against its definition.
                                                       */
                               1914                 : static int
  801 heikki.linnakangas       1915         1263025 : pg_utf8_verifystr(const unsigned char *s, int len)
                               1916                 : {
                               1917         1263025 :     const unsigned char *start = s;
  537 john.naylor              1918         1263025 :     const int   orig_len = len;
                               1919         1263025 :     uint32      state = BGN;
                               1920                 : 
                               1921                 : /*
                               1922                 :  * With a stride of two vector widths, gcc will unroll the loop. Even if
                               1923                 :  * the compiler can unroll a longer loop, it's not worth it because we
                               1924                 :  * must fall back to the byte-wise algorithm if we find any non-ASCII.
                               1925                 :  */
                               1926                 : #define STRIDE_LENGTH (2 * sizeof(Vector8))
                               1927                 : 
                                                          /*
                                                           * NOTE(review): STRIDE_LENGTH is a sizeof expression, so the
                                                           * comparisons below are carried out in an unsigned type.  This
                                                           * assumes len is never negative -- confirm against callers.
                                                           */
  537 john.naylor              1928 GIC     1263025 :     if (len >= STRIDE_LENGTH)
  537 john.naylor              1929 ECB             :     {
  537 john.naylor              1930 GIC     4896864 :         while (len >= STRIDE_LENGTH)
  537 john.naylor              1931 ECB             :         {
                               1932                 :             /*
                               1933                 :              * If the chunk is all ASCII, we can skip the full UTF-8 check,
                               1934                 :              * but we must first check for a non-END state, which means the
                               1935                 :              * previous chunk ended in the middle of a multibyte sequence.
                               1936                 :              */
  537 john.naylor              1937 GIC     4159691 :             if (state != END || !is_valid_ascii(s, STRIDE_LENGTH))
  537 john.naylor              1938 CBC         751 :                 utf8_advance(s, &state, STRIDE_LENGTH);
  537 john.naylor              1939 ECB             : 
  537 john.naylor              1940 GIC     4159691 :             s += STRIDE_LENGTH;
  537 john.naylor              1941 CBC     4159691 :             len -= STRIDE_LENGTH;
  537 john.naylor              1942 ECB             :         }
                               1943                 : 
                               1944                 :         /* The error state persists, so we only need to check for it here. */
  537 john.naylor              1945 GIC      737173 :         if (state == ERR)
  537 john.naylor              1946 ECB             :         {
                               1947                 :             /*
                               1948                 :              * Start over from the beginning with the slow path so we can
                               1949                 :              * count the valid bytes.
                               1950                 :              */
  537 john.naylor              1951 GIC         252 :             len = orig_len;
  537 john.naylor              1952 CBC         252 :             s = start;
  537 john.naylor              1953 ECB             :         }
  447 john.naylor              1954 GIC      736921 :         else if (state != END)
  537 john.naylor              1955 ECB             :         {
                               1956                 :             /*
                               1957                 :              * The fast path exited in the middle of a multibyte sequence.
                               1958                 :              * Walk backwards to find the leading byte so that the slow path
                               1959                 :              * can resume checking from there. We must always backtrack at
                               1960                 :              * least one byte, since the current byte could be e.g. an ASCII
                               1961                 :              * byte after a 2-byte lead, which is invalid.
                               1962                 :              */
                               1963                 :             do
                               1964                 :             {
  447 john.naylor              1965 GIC          51 :                 Assert(s > start);
  447 john.naylor              1966 CBC          51 :                 s--;
                               1967              51 :                 len++;
                               1968              51 :                 Assert(IS_HIGHBIT_SET(*s));
                               1969              51 :             } while (pg_utf_mblen(s) <= 1);
  537 john.naylor              1970 ECB             :         }
                               1971                 :     }
                               1972                 : 
                               1973                 :     /* check remaining bytes */
  801 heikki.linnakangas       1974 GIC    15833150 :     while (len > 0)
  801 heikki.linnakangas       1975 ECB             :     {
                               1976                 :         int         l;
                               1977                 : 
                               1978                 :         /* fast path for ASCII-subset characters */
  801 heikki.linnakangas       1979 GIC    14571209 :         if (!IS_HIGHBIT_SET(*s))
  801 heikki.linnakangas       1980 ECB             :         {
  801 heikki.linnakangas       1981 GIC    14566031 :             if (*s == '\0')
  801 heikki.linnakangas       1982 CBC          88 :                 break;
                               1983        14565943 :             l = 1;
  801 heikki.linnakangas       1984 ECB             :         }
                               1985                 :         else
                               1986                 :         {
  801 heikki.linnakangas       1987 GIC        5178 :             l = pg_utf8_verifychar(s, len);
  801 heikki.linnakangas       1988 CBC        5178 :             if (l == -1)
                               1989             996 :                 break;
  801 heikki.linnakangas       1990 ECB             :         }
  801 heikki.linnakangas       1991 GIC    14570125 :         s += l;
  801 heikki.linnakangas       1992 CBC    14570125 :         len -= l;
  801 heikki.linnakangas       1993 ECB             :     }
                               1994                 : 
                                                          /* number of bytes consumed before stopping */
  801 heikki.linnakangas       1995 GIC     1263025 :     return s - start;
  801 heikki.linnakangas       1996 ECB             : }
                               1997                 : 
                               1998                 : /*
                               1999                 :  * Check for validity of a single UTF-8 encoded character
                               2000                 :  *
                               2001                 :  * This directly implements the rules in RFC3629.  The bizarre-looking
                               2002                 :  * restrictions on the second byte are meant to ensure that there isn't
                               2003                 :  * more than one encoding of a given Unicode code point; that is,
                               2004                 :  * you may not use a longer-than-necessary byte sequence with high order
                               2005                 :  * zero bits to represent a character that would fit in fewer bytes.
                               2006                 :  * To do otherwise is to create security hazards (eg, create an apparent
                               2007                 :  * non-ASCII character that decodes to plain ASCII).
                               2008                 :  *
                               2009                 :  * length is assumed to have been obtained by pg_utf_mblen(), and the
                               2010                 :  * caller must have checked that that many bytes are present in the buffer.
                                                       *
                                                       * Returns true if the length bytes at source pass every check below, and
                                                       * false otherwise (including for any length outside 1..4).  Bytes are
                                                       * examined from the last one back to the first, each case falling through
                                                       * to the next shorter length.
                               2011                 :  */
                               2012                 : bool
 6385 bruce                    2013 GIC        7892 : pg_utf8_islegal(const unsigned char *source, int length)
 6385 bruce                    2014 ECB             : {
                               2015                 :     unsigned char a;
                               2016                 : 
 6385 bruce                    2017 GIC        7892 :     switch (length)
 6385 bruce                    2018 ECB             :     {
 6385 bruce                    2019 UIC           0 :         default:
 6167 tgl                      2020 EUB             :             /* reject lengths 5 and 6 for now (and any other length outside 1..4) */
 6385 bruce                    2021 UIC           0 :             return false;
 6385 bruce                    2022 GBC         368 :         case 4:
                                                                  /* fourth byte must be a continuation byte (80..BF) */
 6167 tgl                      2023 CBC         368 :             a = source[3];
                               2024             368 :             if (a < 0x80 || a > 0xBF)
 6385 bruce                    2025              48 :                 return false;
 1061 alvherre                 2026 ECB             :             /* FALL THRU */
                               2027                 :         case 3:
                                                                  /* third byte must be a continuation byte (80..BF) */
 6167 tgl                      2028 GIC        3855 :             a = source[2];
 6167 tgl                      2029 CBC        3855 :             if (a < 0x80 || a > 0xBF)
 6385 bruce                    2030             300 :                 return false;
 1061 alvherre                 2031 ECB             :             /* FALL THRU */
                               2032                 :         case 2:
                                                                  /*
                                                                   * The range allowed for the second byte depends on the lead
                                                                   * byte: E0 requires A0..BF, ED requires 80..9F, F0 requires
                                                                   * 90..BF, F4 requires 80..8F, and any other lead accepts the
                                                                   * full continuation range 80..BF.
                                                                   */
 6167 tgl                      2033 GIC        5733 :             a = source[1];
 6385 bruce                    2034 CBC        5733 :             switch (*source)
 6385 bruce                    2035 ECB             :             {
 6385 bruce                    2036 GIC         156 :                 case 0xE0:
 6167 tgl                      2037 CBC         156 :                     if (a < 0xA0 || a > 0xBF)
 6385 bruce                    2038             132 :                         return false;
                               2039              24 :                     break;
                               2040             156 :                 case 0xED:
 6167 tgl                      2041             156 :                     if (a < 0x80 || a > 0x9F)
 6385 bruce                    2042             132 :                         return false;
                               2043              24 :                     break;
                               2044             230 :                 case 0xF0:
 6167 tgl                      2045             230 :                     if (a < 0x90 || a > 0xBF)
 6385 bruce                    2046             132 :                         return false;
                               2047              98 :                     break;
                               2048              90 :                 case 0xF4:
 6167 tgl                      2049              90 :                     if (a < 0x80 || a > 0x8F)
 6385 bruce                    2050              66 :                         return false;
                               2051              24 :                     break;
                               2052            5101 :                 default:
 6167 tgl                      2053            5101 :                     if (a < 0x80 || a > 0xBF)
 6385 bruce                    2054              48 :                         return false;
 6167 tgl                      2055            5053 :                     break;
 6385 bruce                    2056 ECB             :             }
                               2057                 :             /* FALL THRU */
                               2058                 :         case 1:
                                                                  /* first byte: 80..C1 and anything above F4 are never accepted */
 6167 tgl                      2059 GIC        7034 :             a = *source;
 6167 tgl                      2060 CBC        7034 :             if (a >= 0x80 && a < 0xC2)
                               2061             198 :                 return false;
                               2062            6836 :             if (a > 0xF4)
 6385 bruce                    2063              66 :                 return false;
 6167 tgl                      2064            6770 :             break;
 6385 bruce                    2065 ECB             :     }
 6385 bruce                    2066 GIC        6770 :     return true;
 6507 bruce                    2067 ECB             : }
                               2068                 : 
                               2069                 : 
                               2070                 : /*
                               2071                 :  *-------------------------------------------------------------------
                               2072                 :  * encoding info table
                               2073                 :  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
                               2074                 :  *-------------------------------------------------------------------
                               2075                 :  */
                               2076                 : const pg_wchar_tbl pg_wchar_table[] = {
                               2077                 :     {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},   /* PG_SQL_ASCII */
                               2078                 :     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},  /* PG_EUC_JP */
                               2079                 :     {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},  /* PG_EUC_CN */
                               2080                 :     {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},  /* PG_EUC_KR */
                               2081                 :     {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},  /* PG_EUC_TW */
                               2082                 :     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},  /* PG_EUC_JIS_2004 */
                               2083                 :     {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},  /* PG_UTF8 */
                               2084                 :     {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},  /* PG_MULE_INTERNAL */
                               2085                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN1 */
                               2086                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN2 */
                               2087                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN3 */
                               2088                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN4 */
                               2089                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN5 */
                               2090                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN6 */
                               2091                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN7 */
                               2092                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN8 */
                               2093                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN9 */
                               2094                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_LATIN10 */
                               2095                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1256 */
                               2096                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1258 */
                               2097                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN866 */
                               2098                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN874 */
                               2099                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_KOI8R */
                               2100                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1251 */
                               2101                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1252 */
                               2102                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-5 */
                               2103                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-6 */
                               2104                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-7 */
                               2105                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* ISO-8859-8 */
                               2106                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1250 */
                               2107                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1253 */
                               2108                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1254 */
                               2109                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1255 */
                               2110                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_WIN1257 */
                               2111                 :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},  /* PG_KOI8U */
                               2112                 :     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},    /* PG_SJIS */
                               2113                 :     {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},    /* PG_BIG5 */
                               2114                 :     {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},    /* PG_GBK */
                               2115                 :     {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},    /* PG_UHC */
                               2116                 :     {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},    /* PG_GB18030 */
                               2117                 :     {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},    /* PG_JOHAB */
                               2118                 :     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2} /* PG_SHIFT_JIS_2004 */
                               2119                 : };
                               2120                 : 
                               2121                 : /*
                               2122                 :  * Returns the byte length of a multibyte character.
                               2123                 :  *
                               2124                 :  * Caution: when dealing with text that is not certainly valid in the
                               2125                 :  * specified encoding, the result may exceed the actual remaining
                               2126                 :  * string length.  Callers that are not prepared to deal with that
                               2127                 :  * should use pg_encoding_mblen_bounded() instead.
                               2128                 :  */
                               2129                 : int
 6167 tgl                      2130 GIC    24328982 : pg_encoding_mblen(int encoding, const char *mbstr)
 6167 tgl                      2131 ECB             : {
 3303 bruce                    2132 GIC    24328982 :     return (PG_VALID_ENCODING(encoding) ?
 2040 peter_e                  2133 CBC    48657964 :             pg_wchar_table[encoding].mblen((const unsigned char *) mbstr) :
 2040 peter_e                  2134 LBC           0 :             pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
 6167 tgl                      2135 EUB             : }
                               2136                 : 
                               2137                 : /*
                               2138                 :  * Returns the byte length of a multibyte character; but not more than
                               2139                 :  * the distance to end of string.
                               2140                 :  */
                               2141                 : int
  671 tgl                      2142 GIC          60 : pg_encoding_mblen_bounded(int encoding, const char *mbstr)
  671 tgl                      2143 ECB             : {
  671 tgl                      2144 GIC          60 :     return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
  671 tgl                      2145 ECB             : }
                               2146                 : 
                               2147                 : /*
                               2148                 :  * Returns the display length of a multibyte character.
                               2149                 :  */
                               2150                 : int
 6167 tgl                      2151 GIC    24249034 : pg_encoding_dsplen(int encoding, const char *mbstr)
 6167 tgl                      2152 ECB             : {
 3303 bruce                    2153 GIC    24249034 :     return (PG_VALID_ENCODING(encoding) ?
 2040 peter_e                  2154 CBC    48498068 :             pg_wchar_table[encoding].dsplen((const unsigned char *) mbstr) :
 2040 peter_e                  2155 LBC           0 :             pg_wchar_table[PG_SQL_ASCII].dsplen((const unsigned char *) mbstr));
 6167 tgl                      2156 EUB             : }
                               2157                 : 
                               2158                 : /*
                               2159                 :  * Verify the first multibyte character of the given string.
                               2160                 :  * Return its byte length if good, -1 if bad.  (See comments above for
                               2161                 :  * full details of the mbverifychar API.)
                               2162                 :  */
                               2163                 : int
  801 heikki.linnakangas       2164 GIC        1170 : pg_encoding_verifymbchar(int encoding, const char *mbstr, int len)
  801 heikki.linnakangas       2165 ECB             : {
  801 heikki.linnakangas       2166 GIC        1170 :     return (PG_VALID_ENCODING(encoding) ?
  801 heikki.linnakangas       2167 CBC        2340 :             pg_wchar_table[encoding].mbverifychar((const unsigned char *) mbstr, len) :
  801 heikki.linnakangas       2168 LBC           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifychar((const unsigned char *) mbstr, len));
  801 heikki.linnakangas       2169 EUB             : }
                               2170                 : 
                               2171                 : /*
                               2172                 :  * Verify that a string is valid for the given encoding.
                               2173                 :  * Returns the number of input bytes (<= len) that form a valid string.
                               2174                 :  * (See comments above for full details of the mbverifystr API.)
                               2175                 :  */
                               2176                 : int
  801 heikki.linnakangas       2177 GIC      216868 : pg_encoding_verifymbstr(int encoding, const char *mbstr, int len)
 6167 tgl                      2178 ECB             : {
 3303 bruce                    2179 GIC      216868 :     return (PG_VALID_ENCODING(encoding) ?
  801 heikki.linnakangas       2180 CBC      433736 :             pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len) :
  801 heikki.linnakangas       2181 LBC           0 :             pg_wchar_table[PG_SQL_ASCII].mbverifystr((const unsigned char *) mbstr, len));
 6167 tgl                      2182 EUB             : }
                               2183                 : 
                               2184                 : /*
                               2185                 :  * fetch maximum length of a given encoding
                               2186                 :  */
                               2187                 : int
 6167 tgl                      2188 GIC      500733 : pg_encoding_max_length(int encoding)
 6167 tgl                      2189 ECB             : {
 6167 tgl                      2190 GIC      500733 :     Assert(PG_VALID_ENCODING(encoding));
 6167 tgl                      2191 ECB             : 
 6167 tgl                      2192 GIC      500733 :     return pg_wchar_table[encoding].maxmblen;
 6167 tgl                      2193 ECB             : }
        

Generated by: LCOV version v1.16-55-g56c0a2a