LCOV - differential code coverage report
Current view: top level - src/backend/utils/mb - conv.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 84.6 % 325 275 50 275
Current Date: 2023-04-08 17:13:01 Functions: 100.0 % 11 11 11
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (240..) days: 84.6 % 325 275 50 275
Legend: Lines: hit not hit Function coverage date bins:
(240..) days: 100.0 % 11 11 11

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  *    Utility functions for conversion procs.
                                  4                 :  *
                                  5                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                  6                 :  * Portions Copyright (c) 1994, Regents of the University of California
                                  7                 :  *
                                  8                 :  * IDENTIFICATION
                                  9                 :  *    src/backend/utils/mb/conv.c
                                 10                 :  *
                                 11                 :  *-------------------------------------------------------------------------
                                 12                 :  */
                                 13                 : #include "postgres.h"
                                 14                 : #include "mb/pg_wchar.h"
                                 15                 : 
                                 16                 : 
                                 17                 : /*
                                 18                 :  * local2local: a generic single byte charset encoding
                                 19                 :  * conversion between two ASCII-superset encodings.
                                 20                 :  *
                                 21                 :  * l points to the source string of length len
                                 22                 :  * p is the output area (must be large enough!)
                                 23                 :  * src_encoding is the PG identifier for the source encoding
                                 24                 :  * dest_encoding is the PG identifier for the target encoding
                                 25                 :  * tab holds conversion entries for the source charset
                                 26                 :  * starting from 128 (0x80). each entry in the table holds the corresponding
                                 27                 :  * code point for the target charset, or 0 if there is no equivalent code.
                                 28                 :  *
                                 29                 :  * Returns the number of input bytes consumed.  If noError is true, this can
                                 30                 :  * be less than 'len'.
                                 31                 :  */
                                 32                 : int
 2689 tgl                        33 CBC         114 : local2local(const unsigned char *l,
                                 34                 :             unsigned char *p,
                                 35                 :             int len,
                                 36                 :             int src_encoding,
                                 37                 :             int dest_encoding,
                                 38                 :             const unsigned char *tab,
                                 39                 :             bool noError)
                                 40                 : {
  738 heikki.linnakangas         41             114 :     const unsigned char *start = l;
                                 42                 :     unsigned char c1,
                                 43                 :                 c2;
                                 44                 : 
 2689 tgl                        45             366 :     while (len > 0)
                                 46                 :     {
                                 47             306 :         c1 = *l;
                                 48             306 :         if (c1 == 0)
                                 49                 :         {
  738 heikki.linnakangas         50              54 :             if (noError)
                                 51              27 :                 break;
 2689 tgl                        52              27 :             report_invalid_encoding(src_encoding, (const char *) l, len);
                                 53                 :         }
                                 54             252 :         if (!IS_HIGHBIT_SET(c1))
                                 55             153 :             *p++ = c1;
                                 56                 :         else
                                 57                 :         {
                                 58              99 :             c2 = tab[c1 - HIGHBIT];
                                 59              99 :             if (c2)
                                 60              99 :                 *p++ = c2;
                                 61                 :             else
                                 62                 :             {
  738 heikki.linnakangas         63 UBC           0 :                 if (noError)
                                 64               0 :                     break;
 2689 tgl                        65               0 :                 report_untranslatable_char(src_encoding, dest_encoding,
                                 66                 :                                            (const char *) l, len);
                                 67                 :             }
                                 68                 :         }
 2689 tgl                        69 CBC         252 :         l++;
                                 70             252 :         len--;
                                 71                 :     }
                                 72              87 :     *p = '\0';
                                 73                 : 
  738 heikki.linnakangas         74              87 :     return l - start;
                                 75                 : }
                                 76                 : 
                                 77                 : /*
                                 78                 :  * LATINn ---> MIC when the charset's local codes map directly to MIC
                                 79                 :  *
                                 80                 :  * l points to the source string of length len
                                 81                 :  * p is the output area (must be large enough!)
                                 82                 :  * lc is the mule character set id for the local encoding
                                 83                 :  * encoding is the PG identifier for the local encoding
                                 84                 :  *
                                 85                 :  * Returns the number of input bytes consumed.  If noError is true, this can
                                 86                 :  * be less than 'len'.
                                 87                 :  */
                                 88                 : int
 6167 tgl                        89              15 : latin2mic(const unsigned char *l, unsigned char *p, int len,
                                 90                 :           int lc, int encoding, bool noError)
                                 91                 : {
  738 heikki.linnakangas         92              15 :     const unsigned char *start = l;
                                 93                 :     int         c1;
                                 94                 : 
 6167 tgl                        95              60 :     while (len > 0)
                                 96                 :     {
                                 97              45 :         c1 = *l;
                                 98              45 :         if (c1 == 0)
                                 99                 :         {
  738 heikki.linnakangas        100 UBC           0 :             if (noError)
                                101               0 :                 break;
 6167 tgl                       102               0 :             report_invalid_encoding(encoding, (const char *) l, len);
                                103                 :         }
 6313 bruce                     104 CBC          45 :         if (IS_HIGHBIT_SET(c1))
 6167 tgl                       105 UBC           0 :             *p++ = lc;
 8986 bruce                     106 CBC          45 :         *p++ = c1;
 6167 tgl                       107              45 :         l++;
                                108              45 :         len--;
                                109                 :     }
 8986 bruce                     110              15 :     *p = '\0';
                                111                 : 
  738 heikki.linnakangas        112              15 :     return l - start;
                                113                 : }
                                114                 : 
                                115                 : /*
                                116                 :  * MIC ---> LATINn when the charset's local codes map directly to MIC
                                117                 :  *
                                118                 :  * mic points to the source string of length len
                                119                 :  * p is the output area (must be large enough!)
                                120                 :  * lc is the mule character set id for the local encoding
                                121                 :  * encoding is the PG identifier for the local encoding
                                122                 :  *
                                123                 :  * Returns the number of input bytes consumed.  If noError is true, this can
                                124                 :  * be less than 'len'.
                                125                 :  */
                                126                 : int
 6167 tgl                       127             177 : mic2latin(const unsigned char *mic, unsigned char *p, int len,
                                128                 :           int lc, int encoding, bool noError)
                                129                 : {
  738 heikki.linnakangas        130             177 :     const unsigned char *start = mic;
                                131                 :     int         c1;
                                132                 : 
 6167 tgl                       133             420 :     while (len > 0)
                                134                 :     {
                                135             387 :         c1 = *mic;
                                136             387 :         if (c1 == 0)
                                137                 :         {
  738 heikki.linnakangas        138 UBC           0 :             if (noError)
                                139               0 :                 break;
 6167 tgl                       140               0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
                                141                 :         }
 6167 tgl                       142 CBC         387 :         if (!IS_HIGHBIT_SET(c1))
                                143                 :         {
                                144                 :             /* easy for ASCII */
                                145             180 :             *p++ = c1;
                                146             180 :             mic++;
                                147             180 :             len--;
                                148                 :         }
                                149                 :         else
                                150                 :         {
 1179                           151             207 :             int         l = pg_mule_mblen(mic);
                                152                 : 
 6167                           153             207 :             if (len < l)
                                154                 :             {
  738 heikki.linnakangas        155              54 :                 if (noError)
                                156              27 :                     break;
 6167 tgl                       157              27 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
                                158                 :                                         len);
                                159                 :             }
                                160             153 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
                                161                 :             {
  738 heikki.linnakangas        162              90 :                 if (noError)
                                163              45 :                     break;
 6167 tgl                       164              45 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                                165                 :                                            (const char *) mic, len);
                                166                 :             }
                                167              63 :             *p++ = mic[1];
                                168              63 :             mic += 2;
                                169              63 :             len -= 2;
                                170                 :         }
                                171                 :     }
 8986 bruce                     172             105 :     *p = '\0';
                                173                 : 
  738 heikki.linnakangas        174             105 :     return mic - start;
                                175                 : }
                                176                 : 
                                177                 : 
                                178                 : /*
                                179                 :  * latin2mic_with_table: a generic single byte charset encoding
                                180                 :  * conversion from a local charset to the mule internal code.
                                181                 :  *
                                182                 :  * l points to the source string of length len
                                183                 :  * p is the output area (must be large enough!)
                                184                 :  * lc is the mule character set id for the local encoding
                                185                 :  * encoding is the PG identifier for the local encoding
                                186                 :  * tab holds conversion entries for the local charset
                                187                 :  * starting from 128 (0x80). each entry in the table holds the corresponding
                                188                 :  * code point for the mule encoding, or 0 if there is no equivalent code.
                                189                 :  *
                                190                 :  * Returns the number of input bytes consumed.  If noError is true, this can
                                191                 :  * be less than 'len'.
                                192                 :  */
                                193                 : int
 6167 tgl                       194              84 : latin2mic_with_table(const unsigned char *l,
                                195                 :                      unsigned char *p,
                                196                 :                      int len,
                                197                 :                      int lc,
                                198                 :                      int encoding,
                                199                 :                      const unsigned char *tab,
                                200                 :                      bool noError)
                                201                 : {
  738 heikki.linnakangas        202              84 :     const unsigned char *start = l;
                                203                 :     unsigned char c1,
                                204                 :                 c2;
                                205                 : 
 6167 tgl                       206             246 :     while (len > 0)
                                207                 :     {
                                208             216 :         c1 = *l;
                                209             216 :         if (c1 == 0)
                                210                 :         {
  738 heikki.linnakangas        211              54 :             if (noError)
                                212              27 :                 break;
 6167 tgl                       213              27 :             report_invalid_encoding(encoding, (const char *) l, len);
                                214                 :         }
                                215             162 :         if (!IS_HIGHBIT_SET(c1))
 8720 bruce                     216              63 :             *p++ = c1;
                                217                 :         else
                                218                 :         {
 6167 tgl                       219              99 :             c2 = tab[c1 - HIGHBIT];
 8720 bruce                     220              99 :             if (c2)
                                221                 :             {
                                222              99 :                 *p++ = lc;
                                223              99 :                 *p++ = c2;
                                224                 :             }
                                225                 :             else
                                226                 :             {
  738 heikki.linnakangas        227 UBC           0 :                 if (noError)
                                228               0 :                     break;
 6167 tgl                       229               0 :                 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
                                230                 :                                            (const char *) l, len);
                                231                 :             }
                                232                 :         }
 6167 tgl                       233 CBC         162 :         l++;
                                234             162 :         len--;
                                235                 :     }
 8720 bruce                     236              57 :     *p = '\0';
                                237                 : 
  738 heikki.linnakangas        238              57 :     return l - start;
                                239                 : }
                                240                 : 
                                241                 : /*
                                242                 :  * mic2latin_with_table: a generic single byte charset encoding
                                243                 :  * conversion from the mule internal code to a local charset.
                                244                 :  *
                                245                 :  * mic points to the source string of length len
                                246                 :  * p is the output area (must be large enough!)
                                247                 :  * lc is the mule character set id for the local encoding
                                248                 :  * encoding is the PG identifier for the local encoding
                                249                 :  * tab holds conversion entries for the mule internal code's second byte,
                                250                 :  * starting from 128 (0x80). each entry in the table holds the corresponding
                                251                 :  * code point for the local charset, or 0 if there is no equivalent code.
                                252                 :  *
                                253                 :  * Returns the number of input bytes consumed.  If noError is true, this can
                                254                 :  * be less than 'len'.
                                255                 :  */
                                256                 : int
 6167 tgl                       257             174 : mic2latin_with_table(const unsigned char *mic,
                                258                 :                      unsigned char *p,
                                259                 :                      int len,
                                260                 :                      int lc,
                                261                 :                      int encoding,
                                262                 :                      const unsigned char *tab,
                                263                 :                      bool noError)
                                264                 : {
  738 heikki.linnakangas        265             174 :     const unsigned char *start = mic;
                                266                 :     unsigned char c1,
                                267                 :                 c2;
                                268                 : 
 6167 tgl                       269             408 :     while (len > 0)
                                270                 :     {
                                271             378 :         c1 = *mic;
                                272             378 :         if (c1 == 0)
                                273                 :         {
  738 heikki.linnakangas        274 UBC           0 :             if (noError)
                                275               0 :                 break;
 6167 tgl                       276               0 :             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
                                277                 :         }
 6167 tgl                       278 CBC         378 :         if (!IS_HIGHBIT_SET(c1))
                                279                 :         {
                                280                 :             /* easy for ASCII */
                                281             171 :             *p++ = c1;
                                282             171 :             mic++;
 8720 bruce                     283             171 :             len--;
                                284                 :         }
                                285                 :         else
                                286                 :         {
 1179 tgl                       287             207 :             int         l = pg_mule_mblen(mic);
                                288                 : 
 6167                           289             207 :             if (len < l)
                                290                 :             {
  738 heikki.linnakangas        291              54 :                 if (noError)
                                292              27 :                     break;
 6167 tgl                       293              27 :                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
                                294                 :                                         len);
                                295                 :             }
                                296             153 :             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
                                297              63 :                 (c2 = tab[mic[1] - HIGHBIT]) == 0)
                                298                 :             {
  738 heikki.linnakangas        299              90 :                 if (noError)
                                300              45 :                     break;
 6167 tgl                       301              45 :                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                                302                 :                                            (const char *) mic, len);
                                303                 :                 break;          /* keep compiler quiet */
                                304                 :             }
                                305              63 :             *p++ = c2;
                                306              63 :             mic += 2;
                                307              63 :             len -= 2;
                                308                 :         }
                                309                 :     }
 8720 bruce                     310             102 :     *p = '\0';
                                311                 : 
  738 heikki.linnakangas        312             102 :     return mic - start;
                                313                 : }
                                314                 : 
                                315                 : /*
                                316                 :  * comparison routine for bsearch()
                                317                 :  * this routine is intended for combined UTF8 -> local code
                                318                 :  */
                                319                 : static int
 5859 ishii                     320             234 : compare3(const void *p1, const void *p2)
                                321                 : {
                                322                 :     uint32      s1,
                                323                 :                 s2,
                                324                 :                 d1,
                                325                 :                 d2;
                                326                 : 
 4228 peter_e                   327             234 :     s1 = *(const uint32 *) p1;
                                328             234 :     s2 = *((const uint32 *) p1 + 1);
                                329             234 :     d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
                                330             234 :     d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
 5859 ishii                     331             234 :     return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
                                332                 : }
                                333                 : 
                                334                 : /*
                                335                 :  * comparison routine for bsearch()
                                336                 :  * this routine is intended for local code -> combined UTF8
                                337                 :  */
                                338                 : static int
                                339              81 : compare4(const void *p1, const void *p2)
                                340                 : {
                                341                 :     uint32      v1,
                                342                 :                 v2;
                                343                 : 
 4228 peter_e                   344              81 :     v1 = *(const uint32 *) p1;
                                345              81 :     v2 = ((const pg_local_to_utf_combined *) p2)->code;
 5859 ishii                     346              81 :     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
                                347                 : }
                                348                 : 
                                349                 : /*
                                350                 :  * store 32bit character representation into multibyte stream
                                351                 :  */
                                352                 : static inline unsigned char *
 2887 tgl                       353             612 : store_coded_char(unsigned char *dest, uint32 code)
                                354                 : {
 5859 ishii                     355             612 :     if (code & 0xff000000)
 2887 tgl                       356              63 :         *dest++ = code >> 24;
 5859 ishii                     357             612 :     if (code & 0x00ff0000)
 2887 tgl                       358             261 :         *dest++ = code >> 16;
 5859 ishii                     359             612 :     if (code & 0x0000ff00)
 2887 tgl                       360             549 :         *dest++ = code >> 8;
 5859 ishii                     361             612 :     if (code & 0x000000ff)
 2887 tgl                       362             612 :         *dest++ = code;
                                363             612 :     return dest;
                                364                 : }
                                365                 : 
                                366                 : /*
                                367                 :  * Convert a character using a conversion radix tree.
                                368                 :  *
                                369                 :  * 'l' is the length of the input character in bytes, and b1-b4 are
                                370                 :  * the input character's bytes.
                                                    :  *
                                                    :  * The bytes are right-aligned: a 1-byte character is looked up from b4
                                                    :  * alone, a 2-byte character from b3 and b4, a 3-byte character from
                                                    :  * b2-b4, and a 4-byte character from b1-b4.  The leading bytes that a
                                                    :  * given length does not use are ignored.
                                                    :  *
                                                    :  * Each byte is first checked against the lower/upper bounds that 'rt'
                                                    :  * stores for that length and byte position.  The tree is then walked
                                                    :  * starting from the root index for that length: every lookup except the
                                                    :  * last yields the index that is combined with the next byte.  The walk
                                                    :  * uses rt->chars32 if that pointer is non-NULL, and rt->chars16 otherwise.
                                                    :  *
                                                    :  * Returns the value found by the final lookup, or 0 if any byte is out of
                                                    :  * range or 'l' is not in 1..4.  The callers in this file treat a zero
                                                    :  * result as "no mapping".
                                371                 :  */
                                372                 : static inline uint32
 2218 heikki.linnakangas        373            1035 : pg_mb_radix_conv(const pg_mb_radix_tree *rt,
                                374                 :                  int l,
                                375                 :                  unsigned char b1,
                                376                 :                  unsigned char b2,
                                377                 :                  unsigned char b3,
                                378                 :                  unsigned char b4)
                                379                 : {
                                380            1035 :     if (l == 4)
                                381                 :     {
                                382                 :         /* 4-byte code: uses b1-b4 */
                                383                 : 
                                384                 :         /* check code validity: every byte must be within the tree's range */
                                385              45 :         if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
                                386              45 :             b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
                                387              45 :             b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
                                388              45 :             b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
 2218 heikki.linnakangas        389 UBC           0 :             return 0;
                                390                 : 
                                391                 :         /* perform lookup: three intermediate levels, then the result */
 2218 heikki.linnakangas        392 CBC          45 :         if (rt->chars32)
                                393                 :         {
                                394              45 :             uint32      idx = rt->b4root;
                                395                 : 
                                396              45 :             idx = rt->chars32[b1 + idx - rt->b4_1_lower];
                                397              45 :             idx = rt->chars32[b2 + idx - rt->b4_2_lower];
                                398              45 :             idx = rt->chars32[b3 + idx - rt->b4_3_lower];
                                399              45 :             return rt->chars32[b4 + idx - rt->b4_4_lower];
                                400                 :         }
                                401                 :         else
                                402                 :         {
 2218 heikki.linnakangas        403 UBC           0 :             uint16      idx = rt->b4root;
                                404                 : 
                                405               0 :             idx = rt->chars16[b1 + idx - rt->b4_1_lower];
                                406               0 :             idx = rt->chars16[b2 + idx - rt->b4_2_lower];
                                407               0 :             idx = rt->chars16[b3 + idx - rt->b4_3_lower];
                                408               0 :             return rt->chars16[b4 + idx - rt->b4_4_lower];
                                409                 :         }
                                410                 :     }
 2218 heikki.linnakangas        411 CBC         990 :     else if (l == 3)
                                412                 :     {
                                413                 :         /* 3-byte code: uses b2-b4 */
                                414                 : 
                                415                 :         /* check code validity: every byte must be within the tree's range */
                                416             468 :         if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
                                417             144 :             b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
                                418             144 :             b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
                                419             324 :             return 0;
                                420                 : 
                                421                 :         /* perform lookup: two intermediate levels, then the result */
                                422             144 :         if (rt->chars32)
                                423                 :         {
                                424             144 :             uint32      idx = rt->b3root;
                                425                 : 
                                426             144 :             idx = rt->chars32[b2 + idx - rt->b3_1_lower];
                                427             144 :             idx = rt->chars32[b3 + idx - rt->b3_2_lower];
                                428             144 :             return rt->chars32[b4 + idx - rt->b3_3_lower];
                                429                 :         }
                                430                 :         else
                                431                 :         {
 2218 heikki.linnakangas        432 UBC           0 :             uint16      idx = rt->b3root;
                                433                 : 
                                434               0 :             idx = rt->chars16[b2 + idx - rt->b3_1_lower];
                                435               0 :             idx = rt->chars16[b3 + idx - rt->b3_2_lower];
                                436               0 :             return rt->chars16[b4 + idx - rt->b3_3_lower];
                                437                 :         }
                                438                 :     }
 2218 heikki.linnakangas        439 CBC         522 :     else if (l == 2)
                                440                 :     {
                                441                 :         /* 2-byte code: uses b3 and b4 */
                                442                 : 
                                443                 :         /* check code validity: both bytes must be within the tree's range */
                                444             378 :         if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
                                445             342 :             b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
                                446              36 :             return 0;
                                447                 : 
                                448                 :         /* perform lookup: one intermediate level, then the result */
                                449             342 :         if (rt->chars32)
                                450                 :         {
                                451             261 :             uint32      idx = rt->b2root;
                                452                 : 
                                453             261 :             idx = rt->chars32[b3 + idx - rt->b2_1_lower];
                                454             261 :             return rt->chars32[b4 + idx - rt->b2_2_lower];
                                455                 :         }
                                456                 :         else
                                457                 :         {
                                458              81 :             uint16      idx = rt->b2root;
                                459                 : 
                                460              81 :             idx = rt->chars16[b3 + idx - rt->b2_1_lower];
                                461              81 :             return rt->chars16[b4 + idx - rt->b2_2_lower];
                                462                 :         }
                                463                 :     }
                                464             144 :     else if (l == 1)
                                465                 :     {
                                466                 :         /* 1-byte code: uses b4 only */
                                467                 : 
                                468                 :         /* check code validity */
                                469             144 :         if (b4 < rt->b1_lower || b4 > rt->b1_upper)
 2218 heikki.linnakangas        470 UBC           0 :             return 0;
                                471                 : 
                                472                 :         /* perform lookup: a single level, indexed directly from the root */
 2218 heikki.linnakangas        473 CBC         144 :         if (rt->chars32)
                                474              99 :             return rt->chars32[b4 + rt->b1root - rt->b1_lower];
                                475                 :         else
                                476              45 :             return rt->chars16[b4 + rt->b1root - rt->b1_lower];
                                477                 :     }
 2153 bruce                     478 UBC           0 :     return 0;                   /* l is not 1..4; shouldn't happen */
                                479                 : }
                                480                 : 
                                481                 : /*
                                482                 :  * UTF8 ---> local code
                                483                 :  *
                                484                 :  * utf: input string in UTF8 encoding (need not be null-terminated)
                                485                 :  * len: length of input string (in bytes)
                                486                 :  * iso: pointer to the output area (must be large enough!)
                                487                 :  *        (output string will be null-terminated)
                                488                 :  * map: conversion map for single characters
                                                    :  *        (skipped if NULL)
                                489                 :  * cmap: conversion map for combined characters
                                490                 :  *        (optional, pass NULL if none)
                                491                 :  * cmapsize: number of entries in the conversion map for combined characters
                                492                 :  *        (optional, pass 0 if none)
                                493                 :  * conv_func: algorithmic encoding conversion function
                                494                 :  *        (optional, pass NULL if none)
                                495                 :  * encoding: PG identifier for the local encoding
                                                    :  * noError: if true, stop at the first invalid or untranslatable character
                                                    :  *        instead of reporting an error
                                496                 :  *
                                497                 :  * For each character, the cmap (if provided) is consulted first; if no match,
                                498                 :  * the map is consulted next; if still no match, the conv_func (if provided)
                                499                 :  * is applied.  If no match is found, an error is raised, unless noError is
                                                    :  * true, in which case conversion simply stops at that character.
                                                    :  *
                                                    :  * Single-byte (ASCII) characters are copied to the output unchanged without
                                                    :  * consulting any map.  A zero byte in the input, a character that extends
                                                    :  * past 'len', and a sequence rejected by pg_utf8_islegal() are all treated
                                                    :  * as invalid input.
                                500                 :  *
                                501                 :  * See pg_wchar.h for more details about the data structures used here.
                                502                 :  *
                                503                 :  * Returns the number of input bytes consumed.  If noError is true, this can
                                504                 :  * be less than 'len'.
                                505                 :  */
                                506                 : int
 2887 tgl                       507 CBC        1104 : UtfToLocal(const unsigned char *utf, int len,
                                508                 :            unsigned char *iso,
                                509                 :            const pg_mb_radix_tree *map,
                                510                 :            const pg_utf_to_local_combined *cmap, int cmapsize,
                                511                 :            utf_local_conversion_func conv_func,
                                512                 :            int encoding, bool noError)
                                513                 : {
                                514                 :     uint32      iutf;
                                515                 :     int         l;
                                516                 :     const pg_utf_to_local_combined *cp;
  738 heikki.linnakangas        517            1104 :     const unsigned char *start = utf;
                                518                 : 
 2887 tgl                       519            1104 :     if (!PG_VALID_ENCODING(encoding))
 2887 tgl                       520 UBC           0 :         ereport(ERROR,
                                521                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                522                 :                  errmsg("invalid encoding number: %d", encoding)));
                                523                 : 
 6167 tgl                       524 CBC        3030 :     for (; len > 0; len -= l)
                                525                 :     {
 2218 heikki.linnakangas        526            2736 :         unsigned char b1 = 0;
                                527            2736 :         unsigned char b2 = 0;
                                528            2736 :         unsigned char b3 = 0;
                                529            2736 :         unsigned char b4 = 0;
                                530                 : 
                                531                 :         /* "break" cases all represent errors */
 6167 tgl                       532            2736 :         if (*utf == '\0')
                                533              90 :             break;
                                534                 : 
 8214 ishii                     535            2646 :         l = pg_utf_mblen(utf);
 6167 tgl                       536            2646 :         if (len < l)
                                537             108 :             break;
                                538                 : 
                                539            2538 :         if (!pg_utf8_islegal(utf, l))
                                540             180 :             break;
                                541                 : 
 8214 ishii                     542            2358 :         if (l == 1)
                                543                 :         {
                                544                 :             /* ASCII case is easy, assume it's one-to-one conversion */
                                545            1656 :             *iso++ = *utf++;
                                546            1656 :             continue;
                                547                 :         }
                                548                 : 
                                549                 :         /*
                                                    :          * collect coded char of length l: the bytes are stored right-aligned
                                                    :          * in b1-b4 (the layout pg_mb_radix_conv takes) and are then packed
                                                    :          * into iutf with the first byte in the most significant position.
                                                    :          *
                                                    :          * NOTE(review): b1 is promoted to int before "b1 << 24", so a lead
                                                    :          * byte >= 0x80 shifts into the sign bit; consider whether a cast to
                                                    :          * uint32 before shifting is warranted.
                                                    :          */
 2887 tgl                       550             702 :         if (l == 2)
                                551                 :         {
 2218 heikki.linnakangas        552             207 :             b3 = *utf++;
                                553             207 :             b4 = *utf++;
                                554                 :         }
 6507 bruce                     555             495 :         else if (l == 3)
                                556                 :         {
 2218 heikki.linnakangas        557             495 :             b2 = *utf++;
                                558             495 :             b3 = *utf++;
                                559             495 :             b4 = *utf++;
                                560                 :         }
 6507 bruce                     561 UBC           0 :         else if (l == 4)
                                562                 :         {
 2218 heikki.linnakangas        563               0 :             b1 = *utf++;
                                564               0 :             b2 = *utf++;
                                565               0 :             b3 = *utf++;
                                566               0 :             b4 = *utf++;
                                567                 :         }
                                568                 :         else
                                569                 :         {
 3552 tgl                       570               0 :             elog(ERROR, "unsupported character length %d", l);
                                571                 :             iutf = 0;           /* keep compiler quiet */
                                572                 :         }
 2218 heikki.linnakangas        573 CBC         702 :         iutf = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
                                574                 : 
                                575                 :         /*
                                                    :          * First, try with combined map if possible.  This is attempted only
                                                    :          * when a cmap was given and at least one more input byte follows the
                                                    :          * current character.
                                                    :          */
 5859 ishii                     576             702 :         if (cmap && len > l)
                                577                 :         {
 5624 bruce                     578              72 :             const unsigned char *utf_save = utf;
                                579              72 :             int         len_save = len;
                                580              72 :             int         l_save = l;
                                581                 : 
                                582                 :             /* collect next character, same as above */
 5859 ishii                     583              72 :             len -= l;
                                584                 : 
                                585              72 :             l = pg_utf_mblen(utf);
                                586              72 :             if (len < l)
                                587                 :             {
                                588                 :                 /*
                                                    :                  * need more data to decide if this is a combined char; back
                                                    :                  * utf up to the start of the first character.
                                                    :                  *
                                                    :                  * NOTE(review): len was already reduced by l_save above and
                                                    :                  * is not restored here, so the error report after the loop
                                                    :                  * receives the backed-up utf together with the reduced len
                                                    :                  * -- confirm that is intended.
                                                    :                  */
  738 heikki.linnakangas        589              18 :                 utf -= l_save;
 5859 ishii                     590              18 :                 break;
                                591                 :             }
                                592                 : 
                                593              54 :             if (!pg_utf8_islegal(utf, l))
                                594                 :             {
                                                    :                 /*
                                                    :                  * second character is invalid: report it here unless
                                                    :                  * noError, else back up to the start of the first character
                                                    :                  */
  738 heikki.linnakangas        595 UBC           0 :                 if (!noError)
                                596               0 :                     report_invalid_encoding(PG_UTF8, (const char *) utf, len);
                                597               0 :                 utf -= l_save;
 5859 ishii                     598               0 :                 break;
                                599                 :             }
                                600                 : 
                                601                 :             /* We assume ASCII character cannot be in combined map */
 2887 tgl                       602 CBC          54 :             if (l > 1)
                                603                 :             {
                                604                 :                 uint32      iutf2;
                                605                 :                 uint32      cutf[2];
                                606                 : 
                                607              54 :                 if (l == 2)
                                608                 :                 {
                                609              27 :                     iutf2 = *utf++ << 8;
                                610              27 :                     iutf2 |= *utf++;
                                611                 :                 }
                                612              27 :                 else if (l == 3)
                                613                 :                 {
                                614              27 :                     iutf2 = *utf++ << 16;
                                615              27 :                     iutf2 |= *utf++ << 8;
                                616              27 :                     iutf2 |= *utf++;
                                617                 :                 }
 2887 tgl                       618 UBC           0 :                 else if (l == 4)
                                619                 :                 {
                                620               0 :                     iutf2 = *utf++ << 24;
                                621               0 :                     iutf2 |= *utf++ << 16;
                                622               0 :                     iutf2 |= *utf++ << 8;
                                623               0 :                     iutf2 |= *utf++;
                                624                 :                 }
                                625                 :                 else
                                626                 :                 {
                                627               0 :                     elog(ERROR, "unsupported character length %d", l);
                                628                 :                     iutf2 = 0;  /* keep compiler quiet */
                                629                 :                 }
                                630                 : 
                                                    :                 /* search key: the pair (first char, second char) */
 2887 tgl                       631 CBC          54 :                 cutf[0] = iutf;
                                632              54 :                 cutf[1] = iutf2;
                                633                 : 
                                634              54 :                 cp = bsearch(cutf, cmap, cmapsize,
                                635                 :                              sizeof(pg_utf_to_local_combined), compare3);
                                636                 : 
                                637              54 :                 if (cp)
                                638                 :                 {
                                                    :                     /*
                                                    :                      * Both characters are consumed.  len was already
                                                    :                      * reduced by the first one above; the loop step
                                                    :                      * subtracts l, now the second one's length.
                                                    :                      */
                                639               9 :                     iso = store_coded_char(iso, cp->code);
                                640               9 :                     continue;
                                641                 :                 }
                                642                 :             }
                                643                 : 
                                644                 :             /* fail, so back up to reprocess second character next time */
                                645              45 :             utf = utf_save;
                                646              45 :             len = len_save;
                                647              45 :             l = l_save;
                                648                 :         }
                                649                 : 
                                650                 :         /* Now check ordinary map */
 2218 heikki.linnakangas        651             675 :         if (map)
                                652                 :         {
 2153 bruce                     653             675 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
                                654                 : 
 2218 heikki.linnakangas        655             675 :             if (converted)
                                656                 :             {
                                657             225 :                 iso = store_coded_char(iso, converted);
                                658             225 :                 continue;
                                659                 :             }
                                660                 :         }
                                661                 : 
                                662                 :         /* if there's a conversion function, try that */
 2887 tgl                       663             450 :         if (conv_func)
                                664                 :         {
                                665              36 :             uint32      converted = (*conv_func) (iutf);
                                666                 : 
                                667              36 :             if (converted)
                                668                 :             {
                                669              36 :                 iso = store_coded_char(iso, converted);
                                670              36 :                 continue;
                                671                 :             }
                                672                 :         }
                                673                 : 
                                674                 :         /*
                                                    :          * failed to translate this character; back up so that utf points at
                                                    :          * its first byte, then either stop (noError) or report it.
                                                    :          * report_untranslatable_char() presumably does not return.
                                                    :          */
  738 heikki.linnakangas        675             414 :         utf -= l;
                                676             414 :         if (noError)
                                677             207 :             break;
 2887 tgl                       678             207 :         report_untranslatable_char(PG_UTF8, encoding,
                                679                 :                                    (const char *) utf, len);
                                680                 :     }
                                681                 : 
                                682                 :     /*
                                                    :      * if we broke out of loop early, must be invalid input (or, when noError
                                                    :      * is true, possibly an untranslatable character; nothing is reported in
                                                    :      * the noError case)
                                                    :      */
  738 heikki.linnakangas        683             897 :     if (len > 0 && !noError)
 6167 tgl                       684             198 :         report_invalid_encoding(PG_UTF8, (const char *) utf, len);
                                685                 : 
 8214 ishii                     686             699 :     *iso = '\0';
                                687                 : 
  738 heikki.linnakangas        688             699 :     return utf - start;
                                689                 : }
                                690                 : 
                                691                 : /*
                                692                 :  * local code ---> UTF8
                                693                 :  *
                                694                 :  * iso: input string in local encoding (need not be null-terminated)
                                695                 :  * len: length of input string (in bytes)
                                696                 :  * utf: pointer to the output area (must be large enough!)
                                697                 :  *        (output string will be null-terminated)
                                698                 :  * map: conversion map for single characters
                                699                 :  * cmap: conversion map for combined characters
                                700                 :  *        (optional, pass NULL if none)
                                701                 :  * cmapsize: number of entries in the conversion map for combined characters
                                702                 :  *        (optional, pass 0 if none)
                                703                 :  * conv_func: algorithmic encoding conversion function
                                704                 :  *        (optional, pass NULL if none)
                                705                 :  * encoding: PG identifier for the local encoding
                                706                 :  *
                                707                 :  * For each character, the map is consulted first; if no match, the cmap
                                708                 :  * (if provided) is consulted next; if still no match, the conv_func
                                709                 :  * (if provided) is applied.  An error is raised if no match is found.
                                710                 :  *
                                711                 :  * See pg_wchar.h for more details about the data structures used here.
                                712                 :  *
                                713                 :  * Returns the number of input bytes consumed.  If noError is true, this can
                                714                 :  * be less than 'len'.
                                715                 :  */
                                716                 : int
 2887 tgl                       717             732 : LocalToUtf(const unsigned char *iso, int len,
                                718                 :            unsigned char *utf,
                                719                 :            const pg_mb_radix_tree *map,
                                720                 :            const pg_local_to_utf_combined *cmap, int cmapsize,
                                721                 :            utf_local_conversion_func conv_func,
                                722                 :            int encoding,
                                723                 :            bool noError)
                                724                 : {
                                725                 :     uint32      iiso;
                                726                 :     int         l;
                                727                 :     const pg_local_to_utf_combined *cp;
  738 heikki.linnakangas        728             732 :     const unsigned char *start = iso;
                                729                 : 
 7885 ishii                     730             732 :     if (!PG_VALID_ENCODING(encoding))
 7198 tgl                       731 UBC           0 :         ereport(ERROR,
                                732                 :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                733                 :                  errmsg("invalid encoding number: %d", encoding)));
                                734                 : 
 6167 tgl                       735 CBC        3042 :     for (; len > 0; len -= l)
                                736                 :     {
 2218 heikki.linnakangas        737            2688 :         unsigned char b1 = 0;
                                738            2688 :         unsigned char b2 = 0;
                                739            2688 :         unsigned char b3 = 0;
                                740            2688 :         unsigned char b4 = 0;
                                741                 : 
                                742                 :         /* "break" cases all represent errors */
 6167 tgl                       743            2688 :         if (*iso == '\0')
                                744             162 :             break;
                                745                 : 
 6314 bruce                     746            2526 :         if (!IS_HIGHBIT_SET(*iso))
                                747                 :         {
                                748                 :             /* ASCII case is easy, assume it's one-to-one conversion */
 8214 ishii                     749            1986 :             *utf++ = *iso++;
                                750            1986 :             l = 1;
                                751            1986 :             continue;
                                752                 :         }
                                753                 : 
  801 heikki.linnakangas        754             540 :         l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
 6167 tgl                       755             540 :         if (l < 0)
                                756             180 :             break;
                                757                 : 
                                758                 :         /* collect coded char of length l */
 8214 ishii                     759             360 :         if (l == 1)
 2218 heikki.linnakangas        760             144 :             b4 = *iso++;
 8214 ishii                     761             216 :         else if (l == 2)
                                762                 :         {
 2218 heikki.linnakangas        763             171 :             b3 = *iso++;
                                764             171 :             b4 = *iso++;
                                765                 :         }
 8214 ishii                     766              45 :         else if (l == 3)
                                767                 :         {
 2218 heikki.linnakangas        768 UBC           0 :             b2 = *iso++;
                                769               0 :             b3 = *iso++;
                                770               0 :             b4 = *iso++;
                                771                 :         }
 8214 ishii                     772 CBC          45 :         else if (l == 4)
                                773                 :         {
 2218 heikki.linnakangas        774              45 :             b1 = *iso++;
                                775              45 :             b2 = *iso++;
                                776              45 :             b3 = *iso++;
                                777              45 :             b4 = *iso++;
                                778                 :         }
                                779                 :         else
                                780                 :         {
 3552 tgl                       781 UBC           0 :             elog(ERROR, "unsupported character length %d", l);
                                782                 :             iiso = 0;           /* keep compiler quiet */
                                783                 :         }
 2218 heikki.linnakangas        784 CBC         360 :         iiso = (b1 << 24 | b2 << 16 | b3 << 8 | b4);
                                785                 : 
                                786             360 :         if (map)
                                787                 :         {
 2153 bruce                     788             360 :             uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);
                                789                 : 
 2218 heikki.linnakangas        790             360 :             if (converted)
                                791                 :             {
                                792             279 :                 utf = store_coded_char(utf, converted);
 2887 tgl                       793             279 :                 continue;
                                794                 :             }
                                795                 : 
                                796                 :             /* If there's a combined character map, try that */
 2218 heikki.linnakangas        797              81 :             if (cmap)
                                798                 :             {
                                799              18 :                 cp = bsearch(&iiso, cmap, cmapsize,
                                800                 :                              sizeof(pg_local_to_utf_combined), compare4);
                                801                 : 
                                802              18 :                 if (cp)
                                803                 :                 {
                                804              18 :                     utf = store_coded_char(utf, cp->utf1);
                                805              18 :                     utf = store_coded_char(utf, cp->utf2);
                                806              18 :                     continue;
                                807                 :                 }
                                808                 :             }
                                809                 :         }
                                810                 : 
                                811                 :         /* if there's a conversion function, try that */
 2887 tgl                       812              63 :         if (conv_func)
                                813                 :         {
                                814              45 :             uint32      converted = (*conv_func) (iiso);
                                815                 : 
                                816              45 :             if (converted)
                                817                 :             {
                                818              27 :                 utf = store_coded_char(utf, converted);
                                819              27 :                 continue;
                                820                 :             }
                                821                 :         }
                                822                 : 
                                823                 :         /* failed to translate this character */
  738 heikki.linnakangas        824              36 :         iso -= l;
                                825              36 :         if (noError)
                                826              18 :             break;
 2887 tgl                       827              18 :         report_untranslatable_char(encoding, PG_UTF8,
                                828                 :                                    (const char *) iso, len);
                                829                 :     }
                                830                 : 
                                831                 :     /* if we broke out of loop early, must be invalid input */
  738 heikki.linnakangas        832             714 :     if (len > 0 && !noError)
 6167 tgl                       833             171 :         report_invalid_encoding(encoding, (const char *) iso, len);
                                834                 : 
 8214 ishii                     835             543 :     *utf = '\0';
                                836                 : 
  738 heikki.linnakangas        837             543 :     return iso - start;
                                838                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a