LCOV - differential code coverage report
Current view: top level - src/backend/utils/mb - conv.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 84.6 % 325 275 50 275
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 11 11 11
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  *    Utility functions for conversion procs.
       4                 :  *
       5                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       6                 :  * Portions Copyright (c) 1994, Regents of the University of California
       7                 :  *
       8                 :  * IDENTIFICATION
       9                 :  *    src/backend/utils/mb/conv.c
      10                 :  *
      11                 :  *-------------------------------------------------------------------------
      12                 :  */
      13                 : #include "postgres.h"
      14                 : #include "mb/pg_wchar.h"
      15                 : 
      16                 : 
      17                 : /*
      18                 :  * local2local: a generic single byte charset encoding
      19                 :  * conversion between two ASCII-superset encodings.
      20                 :  *
      21                 :  * l points to the source string of length len
      22                 :  * p is the output area (must be large enough!)
      23                 :  * src_encoding is the PG identifier for the source encoding
      24                 :  * dest_encoding is the PG identifier for the target encoding
      25                 :  * tab holds conversion entries for the source charset
      26                 :  * starting from 128 (0x80). each entry in the table holds the corresponding
      27                 :  * code point for the target charset, or 0 if there is no equivalent code.
      28                 :  *
      29                 :  * Returns the number of input bytes consumed.  If noError is true, this can
      30                 :  * be less than 'len'.
      31                 :  */
      32                 : int
      33 CBC         114 : local2local(const unsigned char *l,
      34                 :             unsigned char *p,
      35                 :             int len,
      36                 :             int src_encoding,
      37                 :             int dest_encoding,
      38                 :             const unsigned char *tab,
      39                 :             bool noError)
      40                 : {
      41             114 :     const unsigned char *start = l;
      42                 :     unsigned char c1,
      43                 :                 c2;
      44                 : 
      45             366 :     while (len > 0)
      46                 :     {
      47             306 :         c1 = *l;
      48             306 :         if (c1 == 0)
      49                 :         {
      50              54 :             if (noError)
      51              27 :                 break;
      52              27 :             report_invalid_encoding(src_encoding, (const char *) l, len);
      53                 :         }
      54             252 :         if (!IS_HIGHBIT_SET(c1))
      55             153 :             *p++ = c1;
      56                 :         else
      57                 :         {
      58              99 :             c2 = tab[c1 - HIGHBIT];
      59              99 :             if (c2)
      60              99 :                 *p++ = c2;
      61                 :             else
      62                 :             {
      63 UBC           0 :                 if (noError)
      64               0 :                     break;
      65               0 :                 report_untranslatable_char(src_encoding, dest_encoding,
      66                 :                                            (const char *) l, len);
      67                 :             }
      68                 :         }
      69 CBC         252 :         l++;
      70             252 :         len--;
      71                 :     }
      72              87 :     *p = '\0';
      73                 : 
      74              87 :     return l - start;
      75                 : }
      76                 : 
      77                 : /*
      78                 :  * LATINn ---> MIC when the charset's local codes map directly to MIC
      79                 :  *
      80                 :  * l points to the source string of length len
      81                 :  * p is the output area (must be large enough!)
      82                 :  * lc is the mule character set id for the local encoding
      83                 :  * encoding is the PG identifier for the local encoding
      84                 :  *
      85                 :  * Returns the number of input bytes consumed.  If noError is true, this can
      86                 :  * be less than 'len'.
      87                 :  */
      88                 : int
      89              15 : latin2mic(const unsigned char *l, unsigned char *p, int len,
      90                 :           int lc, int encoding, bool noError)
      91                 : {
      92              15 :     const unsigned char *start = l;
      93                 :     int         c1;
      94                 : 
      95              60 :     while (len > 0)
      96                 :     {
      97              45 :         c1 = *l;
      98              45 :         if (c1 == 0)
      99                 :         {
     100 UBC           0 :             if (noError)
     101               0 :                 break;
     102               0 :             report_invalid_encoding(encoding, (const char *) l, len);
     103                 :         }
     104 CBC          45 :         if (IS_HIGHBIT_SET(c1))
     105 UBC           0 :             *p++ = lc;
     106 CBC          45 :         *p++ = c1;
     107              45 :         l++;
     108              45 :         len--;
     109                 :     }
     110              15 :     *p = '\0';
     111                 : 
     112              15 :     return l - start;
     113                 : }
     114                 : 
     115                 : /*
     116                 :  * MIC ---> LATINn when the charset's local codes map directly to MIC
     117                 :  *
     118                 :  * mic points to the source string of length len
     119                 :  * p is the output area (must be large enough!)
     120                 :  * lc is the mule character set id for the local encoding
     121                 :  * encoding is the PG identifier for the local encoding
     122                 :  *
     123                 :  * Returns the number of input bytes consumed.  If noError is true, this can
     124                 :  * be less than 'len'.
     125                 :  */
     126                 : int
     127             177 : mic2latin(const unsigned char *mic, unsigned char *p, int len,
     128                 :           int lc, int encoding, bool noError)
     129                 : {
     130             177 :     const unsigned char *start = mic;
     131                 :     int         c1;
     132                 : 
     133             420 :     while (len > 0)
     134                 :     {
     135             387 :         c1 = *mic;
     136             387 :         if (c1 == 0)
     137                 :         {
     138 UBC           0 :             if (noError)
     139               0 :                 break;
     140               0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
     141                 :         }
     142 CBC         387 :         if (!IS_HIGHBIT_SET(c1))
     143                 :         {
     144                 :             /* easy for ASCII */
     145             180 :             *p++ = c1;
     146             180 :             mic++;
     147             180 :             len--;
     148                 :         }
     149                 :         else
     150                 :         {
     151             207 :             int         l = pg_mule_mblen(mic);
     152                 : 
     153             207 :             if (len < l)
     154                 :             {
     155              54 :                 if (noError)
     156              27 :                     break;
     157              27 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
     158                 :                                         len);
     159                 :             }
     160             153 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
     161                 :             {
     162              90 :                 if (noError)
     163              45 :                     break;
     164              45 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
     165                 :                                            (const char *) mic, len);
     166                 :             }
     167              63 :             *p++ = mic[1];
     168              63 :             mic += 2;
     169              63 :             len -= 2;
     170                 :         }
     171                 :     }
     172             105 :     *p = '\0';
     173                 : 
     174             105 :     return mic - start;
     175                 : }
     176                 : 
     177                 : 
     178                 : /*
     179                 :  * latin2mic_with_table: a generic single byte charset encoding
     180                 :  * conversion from a local charset to the mule internal code.
     181                 :  *
     182                 :  * l points to the source string of length len
     183                 :  * p is the output area (must be large enough!)
     184                 :  * lc is the mule character set id for the local encoding
     185                 :  * encoding is the PG identifier for the local encoding
     186                 :  * tab holds conversion entries for the local charset
     187                 :  * starting from 128 (0x80). each entry in the table holds the corresponding
     188                 :  * code point for the mule encoding, or 0 if there is no equivalent code.
     189                 :  *
     190                 :  * Returns the number of input bytes consumed.  If noError is true, this can
     191                 :  * be less than 'len'.
     192                 :  */
     193                 : int
     194              84 : latin2mic_with_table(const unsigned char *l,
     195                 :                      unsigned char *p,
     196                 :                      int len,
     197                 :                      int lc,
     198                 :                      int encoding,
     199                 :                      const unsigned char *tab,
     200                 :                      bool noError)
     201                 : {
     202              84 :     const unsigned char *start = l;
     203                 :     unsigned char c1,
     204                 :                 c2;
     205                 : 
     206             246 :     while (len > 0)
     207                 :     {
     208             216 :         c1 = *l;
     209             216 :         if (c1 == 0)
     210                 :         {
     211              54 :             if (noError)
     212              27 :                 break;
     213              27 :             report_invalid_encoding(encoding, (const char *) l, len);
     214                 :         }
     215             162 :         if (!IS_HIGHBIT_SET(c1))
     216              63 :             *p++ = c1;
     217                 :         else
     218                 :         {
     219              99 :             c2 = tab[c1 - HIGHBIT];
     220              99 :             if (c2)
     221                 :             {
     222              99 :                 *p++ = lc;
     223              99 :                 *p++ = c2;
     224                 :             }
     225                 :             else
     226                 :             {
     227 UBC           0 :                 if (noError)
     228               0 :                     break;
     229               0 :                 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
     230                 :                                            (const char *) l, len);
     231                 :             }
     232                 :         }
     233 CBC         162 :         l++;
     234             162 :         len--;
     235                 :     }
     236              57 :     *p = '\0';
     237                 : 
     238              57 :     return l - start;
     239                 : }
     240                 : 
     241                 : /*
     242                 :  * mic2latin_with_table: a generic single byte charset encoding
     243                 :  * conversion from the mule internal code to a local charset.
     244                 :  *
     245                 :  * mic points to the source string of length len
     246                 :  * p is the output area (must be large enough!)
     247                 :  * lc is the mule character set id for the local encoding
     248                 :  * encoding is the PG identifier for the local encoding
     249                 :  * tab holds conversion entries for the mule internal code's second byte,
     250                 :  * starting from 128 (0x80). each entry in the table holds the corresponding
     251                 :  * code point for the local charset, or 0 if there is no equivalent code.
     252                 :  *
     253                 :  * Returns the number of input bytes consumed.  If noError is true, this can
     254                 :  * be less than 'len'.
     255                 :  */
     256                 : int
     257             174 : mic2latin_with_table(const unsigned char *mic,
     258                 :                      unsigned char *p,
     259                 :                      int len,
     260                 :                      int lc,
     261                 :                      int encoding,
     262                 :                      const unsigned char *tab,
     263                 :                      bool noError)
     264                 : {
     265             174 :     const unsigned char *start = mic;
     266                 :     unsigned char c1,
     267                 :                 c2;
     268                 : 
     269             408 :     while (len > 0)
     270                 :     {
     271             378 :         c1 = *mic;
     272             378 :         if (c1 == 0)
     273                 :         {
     274 UBC           0 :             if (noError)
     275               0 :                 break;
     276               0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
     277                 :         }
     278 CBC         378 :         if (!IS_HIGHBIT_SET(c1))
     279                 :         {
     280                 :             /* easy for ASCII */
     281             171 :             *p++ = c1;
     282             171 :             mic++;
     283             171 :             len--;
     284                 :         }
     285                 :         else
     286                 :         {
     287             207 :             int         l = pg_mule_mblen(mic);
     288                 : 
     289             207 :             if (len < l)
     290                 :             {
     291              54 :                 if (noError)
     292              27 :                     break;
     293              27 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
     294                 :                                         len);
     295                 :             }
     296             153 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
     297              63 :                 (c2 = tab[mic[1] - HIGHBIT]) == 0)
     298                 :             {
     299              90 :                 if (noError)
     300              45 :                     break;
     301              45 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
     302                 :                                            (const char *) mic, len);
     303                 :                 break;          /* keep compiler quiet */
     304                 :             }
     305              63 :             *p++ = c2;
     306              63 :             mic += 2;
     307              63 :             len -= 2;
     308                 :         }
     309                 :     }
     310             102 :     *p = '\0';
     311                 : 
     312             102 :     return mic - start;
     313                 : }
     314                 : 
     315                 : /*
     316                 :  * comparison routine for bsearch()
     317                 :  * this routine is intended for combined UTF8 -> local code
     318                 :  */
     319                 : static int
     320             234 : compare3(const void *p1, const void *p2)
     321                 : {
     322                 :     uint32      s1,
     323                 :                 s2,
     324                 :                 d1,
     325                 :                 d2;
     326                 : 
     327             234 :     s1 = *(const uint32 *) p1;
     328             234 :     s2 = *((const uint32 *) p1 + 1);
     329             234 :     d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
     330             234 :     d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
     331             234 :     return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
     332                 : }
     333                 : 
     334                 : /*
     335                 :  * comparison routine for bsearch()
     336                 :  * this routine is intended for local code -> combined UTF8
     337                 :  */
     338                 : static int
     339              81 : compare4(const void *p1, const void *p2)
     340                 : {
     341                 :     uint32      v1,
     342                 :                 v2;
     343                 : 
     344              81 :     v1 = *(const uint32 *) p1;
     345              81 :     v2 = ((const pg_local_to_utf_combined *) p2)->code;
     346              81 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
     347                 : }
     348                 : 
     349                 : /*
     350                 :  * store 32bit character representation into multibyte stream
     351                 :  */
     352                 : static inline unsigned char *
     353             612 : store_coded_char(unsigned char *dest, uint32 code)
     354                 : {
     355             612 :     if (code & 0xff000000)
     356              63 :         *dest++ = code >> 24;
     357             612 :     if (code & 0x00ff0000)
     358             261 :         *dest++ = code >> 16;
     359             612 :     if (code & 0x0000ff00)
     360             549 :         *dest++ = code >> 8;
     361             612 :     if (code & 0x000000ff)
     362             612 :         *dest++ = code;
     363             612 :     return dest;
     364                 : }
     365                 : 
     366                 : /*
     367                 :  * Convert a character using a conversion radix tree.
     368                 :  *
     369                 :  * 'l' is the length of the input character in bytes, and b1-b4 are
     370                 :  * the input character's bytes.
     371                 :  */
     372                 : static inline uint32
     373            1035 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
     374                 :                  int l,
     375                 :                  unsigned char b1,
     376                 :                  unsigned char b2,
     377                 :                  unsigned char b3,
     378                 :                  unsigned char b4)
     379                 : {
     380            1035 :     if (l == 4)
     381                 :     {
     382                 :         /* 4-byte code */
     383                 : 
     384                 :         /* check code validity */
     385              45 :         if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
     386              45 :             b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
     387              45 :             b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
     388              45 :             b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
     389 UBC           0 :             return 0;
     390                 : 
     391                 :         /* perform lookup */
     392 CBC          45 :         if (rt->chars32)
     393                 :         {
     394              45 :             uint32      idx = rt->b4root;
     395                 : 
     396              45 :             idx = rt->chars32[b1 + idx - rt->b4_1_lower];
     397              45 :             idx = rt->chars32[b2 + idx - rt->b4_2_lower];
     398              45 :             idx = rt->chars32[b3 + idx - rt->b4_3_lower];
     399              45 :             return rt->chars32[b4 + idx - rt->b4_4_lower];
     400                 :         }
     401                 :         else
     402                 :         {
     403 UBC           0 :             uint16      idx = rt->b4root;
     404                 : 
     405               0 :             idx = rt->chars16[b1 + idx - rt->b4_1_lower];
     406               0 :             idx = rt->chars16[b2 + idx - rt->b4_2_lower];
     407               0 :             idx = rt->chars16[b3 + idx - rt->b4_3_lower];
     408               0 :             return rt->chars16[b4 + idx - rt->b4_4_lower];
     409                 :         }
     410                 :     }
     411 CBC         990 :     else if (l == 3)
     412                 :     {
     413                 :         /* 3-byte code */
     414                 : 
     415                 :         /* check code validity */
     416             468 :         if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
     417             144 :             b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
     418             144 :             b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
     419             324 :             return 0;
     420                 : 
     421                 :         /* perform lookup */
     422             144 :         if (rt->chars32)
     423                 :         {
     424             144 :             uint32      idx = rt->b3root;
     425                 : 
     426             144 :             idx = rt->chars32[b2 + idx - rt->b3_1_lower];
     427             144 :             idx = rt->chars32[b3 + idx - rt->b3_2_lower];
     428             144 :             return rt->chars32[b4 + idx - rt->b3_3_lower];
     429                 :         }
     430                 :         else
     431                 :         {
     432 UBC           0 :             uint16      idx = rt->b3root;
     433                 : 
     434               0 :             idx = rt->chars16[b2 + idx - rt->b3_1_lower];
     435               0 :             idx = rt->chars16[b3 + idx - rt->b3_2_lower];
     436               0 :             return rt->chars16[b4 + idx - rt->b3_3_lower];
     437                 :         }
     438                 :     }
     439 CBC         522 :     else if (l == 2)
     440                 :     {
     441                 :         /* 2-byte code */
     442                 : 
     443                 :         /* check code validity - first byte */
     444             378 :         if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
     445             342 :             b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
     446              36 :             return 0;
     447                 : 
     448                 :         /* perform lookup */
     449             342 :         if (rt->chars32)
     450                 :         {
     451             261 :             uint32      idx = rt->b2root;
     452                 : 
     453             261 :             idx = rt->chars32[b3 + idx - rt->b2_1_lower];
     454             261 :             return rt->chars32[b4 + idx - rt->b2_2_lower];
     455                 :         }
     456                 :         else
     457                 :         {
     458              81 :             uint16      idx = rt->b2root;
     459                 : 
     460              81 :             idx = rt->chars16[b3 + idx - rt->b2_1_lower];
     461              81 :             return rt->chars16[b4 + idx - rt->b2_2_lower];
     462                 :         }
     463                 :     }
     464             144 :     else if (l == 1)
     465                 :     {
     466                 :         /* 1-byte code */
     467                 : 
     468                 :         /* check code validity - first byte */
     469             144 :         if (b4 < rt->b1_lower || b4 > rt->b1_upper)
     470 UBC           0 :             return 0;
     471                 : 
     472                 :         /* perform lookup */
     473 CBC         144 :         if (rt->chars32)
     474              99 :             return rt->chars32[b4 + rt->b1root - rt->b1_lower];
     475                 :         else
     476              45 :             return rt->chars16[b4 + rt->b1root - rt->b1_lower];
     477                 :     }
     478 UBC           0 :     return 0;                   /* shouldn't happen */
     479                 : }
     480                 : 
     481                 : /*
     482                 :  * UTF8 ---> local code
     483                 :  *
     484                 :  * utf: input string in UTF8 encoding (need not be null-terminated)
     485                 :  * len: length of input string (in bytes)
     486                 :  * iso: pointer to the output area (must be large enough!)
     487                 :           (output string will be null-terminated)
     488                 :  * map: conversion map for single characters
     489                 :  * cmap: conversion map for combined characters
     490                 :  *        (optional, pass NULL if none)
     491                 :  * cmapsize: number of entries in the conversion map for combined characters
     492                 :  *        (optional, pass 0 if none)
     493                 :  * conv_func: algorithmic encoding conversion function
     494                 :  *        (optional, pass NULL if none)
     495                 :  * encoding: PG identifier for the local encoding
     496                 :  *
     497                 :  * For each character, the cmap (if provided) is consulted first; if no match,
     498                 :  * the map is consulted next; if still no match, the conv_func (if provided)
     499                 :  * is applied.  An error is raised if no match is found.
     500                 :  *
     501                 :  * See pg_wchar.h for more details about the data structures used here.
     502                 :  *
     503                 :  * Returns the number of input bytes consumed.  If noError is true, this can
     504                 :  * be less than 'len'.
     505                 :  */
     506                 : int
     507 CBC        1104 : UtfToLocal(const unsigned char *utf, int len,
     508                 :            unsigned char *iso,
     509                 :            const pg_mb_radix_tree *map,
     510                 :            const pg_utf_to_local_combined *cmap, int cmapsize,
     511                 :            utf_local_conversion_func conv_func,
     512                 :            int encoding, bool noError)
     513                 : {
     514                 :     uint32      iutf;
     515                 :     int         l;
     516                 :     const pg_utf_to_local_combined *cp;
     517            1104 :     const unsigned char *start = utf;
     518                 : 
     519            1104 :     if (!PG_VALID_ENCODING(encoding))
     520 UBC           0 :         ereport(ERROR,
     521                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     522                 :                  errmsg("invalid encoding number: %d", encoding)));
     523                 : 
     524 CBC        3030 :     for (; len > 0; len -= l)
     525                 :     {
     526            2736 :         unsigned char b1 = 0;
     527            2736 :         unsigned char b2 = 0;
     528            2736 :         unsigned char b3 = 0;
     529            2736 :         unsigned char b4 = 0;
     530                 : 
     531                 :         /* "break" cases all represent errors */
     532            2736 :         if (*utf == '\0')
     533              90 :             break;
     534                 : 
     535            2646 :         l = pg_utf_mblen(utf);
     536            2646 :         if (len < l)
     537             108 :             break;
     538                 : 
     539            2538 :         if (!pg_utf8_islegal(utf, l))
     540             180 :             break;
     541                 : 
     542            2358 :         if (l == 1)
     543                 :         {
     544                 :             /* ASCII case is easy, assume it's one-to-one conversion */
     545            1656 :             *iso++ = *utf++;
     546            1656 :             continue;
     547                 :         }
     548                 : 
     549                 :         /* collect coded char of length l */
     550             702 :         if (l == 2)
     551                 :         {
     552             207 :             b3 = *utf++;
     553             207 :             b4 = *utf++;
     554                 :         }
     555             495 :         else if (l == 3)
     556                 :         {
     557             495 :             b2 = *utf++;
     558             495 :             b3 = *utf++;
     559             495 :             b4 = *utf++;
     560                 :         }
     561 UBC           0 :         else if (l == 4)
     562                 :         {
     563               0 :             b1 = *utf++;
     564               0 :             b2 = *utf++;
     565               0 :             b3 = *utf++;
     566               0 :             b4 = *utf++;
     567                 :         }
     568                 :         else
     569                 :         {
     570               0 :             elog(ERROR, "unsupported character length %d", l);
     571                 :             iutf = 0;           /* keep compiler quiet */
     572                 :         }
     573 CBC         702 :         iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
     574                 : 
     575                 :         /* First, try with combined map if possible */
     576             702 :         if (cmap && len > l)
     577                 :         {
     578              72 :             const unsigned char *utf_save = utf;
     579              72 :             int         len_save = len;
     580              72 :             int         l_save = l;
     581                 : 
     582                 :             /* collect next character, same as above */
     583              72 :             len -= l;
     584                 : 
     585              72 :             l = pg_utf_mblen(utf);
     586              72 :             if (len < l)
     587                 :             {
     588                 :                 /* need more data to decide if this is a combined char */
     589              18 :                 utf -= l_save;
     590              18 :                 break;
     591                 :             }
     592                 : 
     593              54 :             if (!pg_utf8_islegal(utf, l))
     594                 :             {
     595 UBC           0 :                 if (!noError)
     596               0 :                     report_invalid_encoding(PG_UTF8, (const char *) utf, len);
     597               0 :                 utf -= l_save;
     598               0 :                 break;
     599                 :             }
     600                 : 
     601                 :             /* We assume ASCII character cannot be in combined map */
     602 CBC          54 :             if (l > 1)
     603                 :             {
     604                 :                 uint32      iutf2;
     605                 :                 uint32      cutf[2];
     606                 : 
     607              54 :                 if (l == 2)
     608                 :                 {
     609              27 :                     iutf2 = *utf++ << 8;
     610              27 :                     iutf2 |= *utf++;
     611                 :                 }
     612              27 :                 else if (l == 3)
     613                 :                 {
     614              27 :                     iutf2 = *utf++ << 16;
     615              27 :                     iutf2 |= *utf++ << 8;
     616              27 :                     iutf2 |= *utf++;
     617                 :                 }
     618 UBC           0 :                 else if (l == 4)
     619                 :                 {
     620               0 :                     iutf2 = *utf++ << 24;
     621               0 :                     iutf2 |= *utf++ << 16;
     622               0 :                     iutf2 |= *utf++ << 8;
     623               0 :                     iutf2 |= *utf++;
     624                 :                 }
     625                 :                 else
     626                 :                 {
     627               0 :                     elog(ERROR, "unsupported character length %d", l);
     628                 :                     iutf2 = 0;  /* keep compiler quiet */
     629                 :                 }
     630                 : 
     631 CBC          54 :                 cutf[0] = iutf;
     632              54 :                 cutf[1] = iutf2;
     633                 : 
     634              54 :                 cp = bsearch(cutf, cmap, cmapsize,
     635                 :                              sizeof(pg_utf_to_local_combined), compare3);
     636                 : 
     637              54 :                 if (cp)
     638                 :                 {
     639               9 :                     iso = store_coded_char(iso, cp->code);
     640               9 :                     continue;
     641                 :                 }
     642                 :             }
     643                 : 
     644                 :             /* fail, so back up to reprocess second character next time */
     645              45 :             utf = utf_save;
     646              45 :             len = len_save;
     647              45 :             l = l_save;
     648                 :         }
     649                 : 
     650                 :         /* Now check ordinary map */
     651             675 :         if (map)
     652                 :         {
     653             675 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     654                 : 
     655             675 :             if (converted)
     656                 :             {
     657             225 :                 iso = store_coded_char(iso, converted);
     658             225 :                 continue;
     659                 :             }
     660                 :         }
     661                 : 
     662                 :         /* if there's a conversion function, try that */
     663             450 :         if (conv_func)
     664                 :         {
     665              36 :             uint32      converted = (*conv_func) (iutf);
     666                 : 
     667              36 :             if (converted)
     668                 :             {
     669              36 :                 iso = store_coded_char(iso, converted);
     670              36 :                 continue;
     671                 :             }
     672                 :         }
     673                 : 
     674                 :         /* failed to translate this character */
     675             414 :         utf -= l;
     676             414 :         if (noError)
     677             207 :             break;
     678             207 :         report_untranslatable_char(PG_UTF8, encoding,
     679                 :                                    (const char *) utf, len);
     680                 :     }
     681                 : 
     682                 :     /* if we broke out of loop early, must be invalid input */
     683             897 :     if (len > 0 && !noError)
     684             198 :         report_invalid_encoding(PG_UTF8, (const char *) utf, len);
     685                 : 
     686             699 :     *iso = '\0';
     687                 : 
     688             699 :     return utf - start;
     689                 : }
     690                 : 
     691                 : /*
     692                 :  * local code ---> UTF8
     693                 :  *
     694                 :  * iso: input string in local encoding (need not be null-terminated)
     695                 :  * len: length of input string (in bytes)
     696                 :  * utf: pointer to the output area (must be large enough!)
     697                 :  *        (output string will be null-terminated)
     698                 :  * map: conversion map for single characters
     699                 :  * cmap: conversion map for combined characters
     700                 :  *        (optional, pass NULL if none)
     701                 :  * cmapsize: number of entries in the conversion map for combined characters
     702                 :  *        (optional, pass 0 if none)
     703                 :  * conv_func: algorithmic encoding conversion function
     704                 :  *        (optional, pass NULL if none)
     705                 :  * encoding: PG identifier for the local encoding
                         :  * noError: if true, stop converting at the first invalid or untranslatable
                         :  *        character instead of reporting an error for it
     706                 :  *
     707                 :  * For each character, the map is consulted first; if no match, the cmap
     708                 :  * (if provided) is consulted next; if still no match, the conv_func
     709                 :  * (if provided) is applied.  An error is raised if no match is found.
                         :  * Note that the cmap lookup is nested inside the "map != NULL" branch, so
                         :  * a cmap is only consulted when a map was supplied as well.
                         :  *
                         :  * Bytes without the high bit set are copied to the output unchanged, without
                         :  * consulting any map.  A zero byte in the input is treated as invalid input.
     710                 :  *
     711                 :  * See pg_wchar.h for more details about the data structures used here.
     712                 :  *
     713                 :  * Returns the number of input bytes consumed.  If noError is true, this can
     714                 :  * be less than 'len'.
     715                 :  */
     716                 : int
     717             732 : LocalToUtf(const unsigned char *iso, int len,
     718                 :            unsigned char *utf,
     719                 :            const pg_mb_radix_tree *map,
     720                 :            const pg_local_to_utf_combined *cmap, int cmapsize,
     721                 :            utf_local_conversion_func conv_func,
     722                 :            int encoding,
     723                 :            bool noError)
     724                 : {
     725                 :     uint32      iiso;
     726                 :     int         l;
     727                 :     const pg_local_to_utf_combined *cp;
     728             732 :     const unsigned char *start = iso;   /* remembered to compute bytes consumed */
     729                 : 
     730             732 :     if (!PG_VALID_ENCODING(encoding))
     731 UBC           0 :         ereport(ERROR,
     732                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     733                 :                  errmsg("invalid encoding number: %d", encoding)));
     734                 : 
     735 CBC        3042 :     for (; len > 0; len -= l)   /* every "continue" path must leave l set */
     736                 :     {
     737            2688 :         unsigned char b1 = 0;
     738            2688 :         unsigned char b2 = 0;
     739            2688 :         unsigned char b3 = 0;
     740            2688 :         unsigned char b4 = 0;
     741                 : 
     742                 :         /* "break" cases all represent errors */
     743            2688 :         if (*iso == '\0')
     744             162 :             break;
     745                 : 
     746            2526 :         if (!IS_HIGHBIT_SET(*iso))
     747                 :         {
     748                 :             /* ASCII case is easy, assume it's one-to-one conversion */
     749            1986 :             *utf++ = *iso++;
     750            1986 :             l = 1;
     751            1986 :             continue;
     752                 :         }
     753                 : 
     754             540 :         l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
     755             540 :         if (l < 0)              /* negative l is handled as invalid input below the loop */
     756             180 :             break;
     757                 : 
     758                 :         /* collect coded char of length l, right-aligned in b1..b4: the last
                         :          * byte of the character always lands in b4 and the unused leading
                         :          * bytes keep their initial value of 0 */
     759             360 :         if (l == 1)
     760             144 :             b4 = *iso++;
     761             216 :         else if (l == 2)
     762                 :         {
     763             171 :             b3 = *iso++;
     764             171 :             b4 = *iso++;
     765                 :         }
     766              45 :         else if (l == 3)
     767                 :         {
     768 UBC           0 :             b2 = *iso++;
     769               0 :             b3 = *iso++;
     770               0 :             b4 = *iso++;
     771                 :         }
     772 CBC          45 :         else if (l == 4)
     773                 :         {
     774              45 :             b1 = *iso++;
     775              45 :             b2 = *iso++;
     776              45 :             b3 = *iso++;
     777              45 :             b4 = *iso++;
     778                 :         }
     779                 :         else
     780                 :         {
     781 UBC           0 :             elog(ERROR, "unsupported character length %d", l);
     782                 :             iiso = 0;           /* keep compiler quiet */
     783                 :         }
     784 CBC         360 :         iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);   /* first input byte is most
                         :          * significant.  NOTE(review): b1 is promoted to int before the
                         :          * shift, so with a 32-bit int a value >= 0x80 is shifted into the
                         :          * sign bit; a uint32 cast would avoid relying on that -- confirm */
     785                 : 
     786             360 :         if (map)
     787                 :         {
     788             360 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
     789                 : 
     790             360 :             if (converted)      /* a zero result is taken to mean "no mapping" */
     791                 :             {
     792             279 :                 utf = store_coded_char(utf, converted);
     793             279 :                 continue;
     794                 :             }
     795                 : 
     796                 :             /* If there's a combined character map, try that */
     797              81 :             if (cmap)
     798                 :             {
     799              18 :                 cp = bsearch(&iiso, cmap, cmapsize,
     800                 :                              sizeof(pg_local_to_utf_combined), compare4);
     801                 : 
     802              18 :                 if (cp)
     803                 :                 {
     804              18 :                     utf = store_coded_char(utf, cp->utf1);  /* one input char yields two output chars */
     805              18 :                     utf = store_coded_char(utf, cp->utf2);
     806              18 :                     continue;
     807                 :                 }
     808                 :             }
     809                 :         }
     810                 : 
     811                 :         /* if there's a conversion function, try that */
     812              63 :         if (conv_func)
     813                 :         {
     814              45 :             uint32      converted = (*conv_func) (iiso);
     815                 : 
     816              45 :             if (converted)
     817                 :             {
     818              27 :                 utf = store_coded_char(utf, converted);
     819              27 :                 continue;
     820                 :             }
     821                 :         }
     822                 : 
     823                 :         /* failed to translate this character; back up so that iso points at
                         :          * its first byte again, both for the error report and for the
                         :          * consumed-bytes count returned to the caller */
     824              36 :         iso -= l;
     825              36 :         if (noError)
     826              18 :             break;
     827              18 :         report_untranslatable_char(encoding, PG_UTF8,
     828                 :                                    (const char *) iso, len);    /* NOTE(review): presumably does not return -- confirm */
     829                 :     }
     830                 : 
     831                 :     /* if we broke out of loop early, must be invalid input (with noError
                         :      * set, len can also be > 0 after an untranslatable character, but no
                         :      * error is reported in that mode) */
     832             714 :     if (len > 0 && !noError)
     833             171 :         report_invalid_encoding(encoding, (const char *) iso, len);
     834                 : 
     835             543 :     *utf = '\0';
     836                 : 
     837             543 :     return iso - start;
     838                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a