LCOV - differential code coverage report
Current view: top level - src/fe_utils - mbprint.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 76.7 % 176 135 41 135
Current Date: 2023-04-08 15:15:32 Functions: 87.5 % 8 7 1 7
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * Multibyte character printing support for frontend code
       4                 :  *
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  * src/fe_utils/mbprint.c
      10                 :  *
      11                 :  *-------------------------------------------------------------------------
      12                 :  */
      13                 : #include "postgres_fe.h"
      14                 : 
      15                 : #include "fe_utils/mbprint.h"
      16                 : 
      17                 : #include "libpq-fe.h"
      18                 : 
      19                 : 
      20                 : /*
      21                 :  * To avoid version-skew problems, this file must not use declarations
      22                 :  * from pg_wchar.h: the encoding IDs we are dealing with are determined
      23                 :  * by the libpq.so we are linked with, and that might not match the
      24                 :  * numbers we see at compile time.  (If this file were inside libpq,
      25                 :  * the problem would go away...)
      26                 :  *
      27                 :  * Hence, we have our own definition of pg_wchar, and we get the values
      28                 :  * of any needed encoding IDs on-the-fly.
      29                 :  */
      30                 : 
      31                 : typedef unsigned int pg_wchar;
      32                 : 
      33                 : static int
      34 CBC     4605072 : pg_get_utf8_id(void)
      35                 : {
      36                 :     static int  utf8_id = -1;
      37                 : 
      38         4605072 :     if (utf8_id < 0)
      39            3991 :         utf8_id = pg_char_to_encoding("utf8");
      40         4605072 :     return utf8_id;
      41                 : }
      42                 : 
      43                 : #define PG_UTF8     pg_get_utf8_id()
      44                 : 
      45                 : 
      46                 : /*
      47                 :  * Convert a UTF-8 character to a Unicode code point.
      48                 :  * This is a one-character version of pg_utf2wchar_with_len.
      49                 :  *
      50                 :  * No error checks here, c must point to a long-enough string.
      51                 :  */
      52                 : static pg_wchar
      53 UBC           0 : utf8_to_unicode(const unsigned char *c)
      54                 : {
      55               0 :     if ((*c & 0x80) == 0)
      56               0 :         return (pg_wchar) c[0];
      57               0 :     else if ((*c & 0xe0) == 0xc0)
      58               0 :         return (pg_wchar) (((c[0] & 0x1f) << 6) |
      59               0 :                            (c[1] & 0x3f));
      60               0 :     else if ((*c & 0xf0) == 0xe0)
      61               0 :         return (pg_wchar) (((c[0] & 0x0f) << 12) |
      62               0 :                            ((c[1] & 0x3f) << 6) |
      63               0 :                            (c[2] & 0x3f));
      64               0 :     else if ((*c & 0xf8) == 0xf0)
      65               0 :         return (pg_wchar) (((c[0] & 0x07) << 18) |
      66               0 :                            ((c[1] & 0x3f) << 12) |
      67               0 :                            ((c[2] & 0x3f) << 6) |
      68               0 :                            (c[3] & 0x3f));
      69                 :     else
      70                 :         /* that is an invalid code on purpose */
      71               0 :         return 0xffffffff;
      72                 : }
      73                 : 
      74                 : 
      75                 : /*
      76                 :  * Unicode 3.1 compliant validation : for each category, it checks the
      77                 :  * combination of each byte to make sure it maps to a valid range. It also
      78                 :  * returns -1 for the following UCS values: ucs > 0x10ffff ucs & 0xfffe =
      79                 :  * 0xfffe 0xfdd0 < ucs < 0xfdef ucs & 0xdb00 = 0xd800 (surrogates)
      80                 :  */
      81                 : static int
      82 CBC    11166761 : utf_charcheck(const unsigned char *c)
      83                 : {
      84        11166761 :     if ((*c & 0x80) == 0)
      85        11166081 :         return 1;
      86             680 :     else if ((*c & 0xe0) == 0xc0)
      87                 :     {
      88                 :         /* two-byte char */
      89             641 :         if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
      90             641 :             return 2;
      91 UBC           0 :         return -1;
      92                 :     }
      93 CBC          39 :     else if ((*c & 0xf0) == 0xe0)
      94                 :     {
      95                 :         /* three-byte char */
      96              27 :         if (((c[1] & 0xc0) == 0x80) &&
      97              27 :             (((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
      98              27 :             ((c[2] & 0xc0) == 0x80))
      99                 :         {
     100              27 :             int         z = c[0] & 0x0f;
     101              27 :             int         yx = ((c[1] & 0x3f) << 6) | (c[0] & 0x3f);
     102              27 :             int         lx = yx & 0x7f;
     103                 : 
     104                 :             /* check 0xfffe/0xffff, 0xfdd0..0xfedf range, surrogates */
     105              27 :             if (((z == 0x0f) &&
     106 UBC           0 :                  (((yx & 0xffe) == 0xffe) ||
     107 CBC          27 :                   (((yx & 0xf80) == 0xd80) && (lx >= 0x30) && (lx <= 0x4f)))) ||
     108 UBC           0 :                 ((z == 0x0d) && ((yx & 0xb00) == 0x800)))
     109               0 :                 return -1;
     110 CBC          27 :             return 3;
     111                 :         }
     112 UBC           0 :         return -1;
     113                 :     }
     114 CBC          12 :     else if ((*c & 0xf8) == 0xf0)
     115                 :     {
     116              12 :         int         u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);
     117                 : 
     118                 :         /* four-byte char */
     119              12 :         if (((c[1] & 0xc0) == 0x80) &&
     120              12 :             (u > 0x00) && (u <= 0x10) &&
     121              12 :             ((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
     122                 :         {
     123                 :             /* test for 0xzzzzfffe/0xzzzzfffff */
     124              12 :             if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
     125 UBC           0 :                 ((c[3] & 0x3e) == 0x3e))
     126               0 :                 return -1;
     127 CBC          12 :             return 4;
     128                 :         }
     129 UBC           0 :         return -1;
     130                 :     }
     131               0 :     return -1;
     132                 : }
     133                 : 
     134                 : 
     135                 : static void
     136 CBC     4603897 : mb_utf_validate(unsigned char *pwcs)
     137                 : {
     138         4603897 :     unsigned char *p = pwcs;
     139                 : 
     140        15770658 :     while (*pwcs)
     141                 :     {
     142                 :         int         len;
     143                 : 
     144        11166761 :         if ((len = utf_charcheck(pwcs)) > 0)
     145                 :         {
     146        11166761 :             if (p != pwcs)
     147                 :             {
     148                 :                 int         i;
     149                 : 
     150 UBC           0 :                 for (i = 0; i < len; i++)
     151               0 :                     *p++ = *pwcs++;
     152                 :             }
     153                 :             else
     154                 :             {
     155 CBC    11166761 :                 pwcs += len;
     156        11166761 :                 p += len;
     157                 :             }
     158                 :         }
     159                 :         else
     160                 :             /* we skip the char */
     161 UBC           0 :             pwcs++;
     162                 :     }
     163 CBC     4603897 :     if (p != pwcs)
     164 UBC           0 :         *p = '\0';
     165 CBC     4603897 : }
     166                 : 
     167                 : /*
     168                 :  * public functions : wcswidth and mbvalidate
     169                 :  */
     170                 : 
     171                 : /*
     172                 :  * pg_wcswidth is the dumb display-width function.
     173                 :  * It assumes that everything will appear on one line.
     174                 :  * OTOH it is easier to use than pg_wcssize if this applies to you.
     175                 :  */
     176                 : int
     177            1607 : pg_wcswidth(const char *pwcs, size_t len, int encoding)
     178                 : {
     179            1607 :     int         width = 0;
     180                 : 
     181           16033 :     while (len > 0)
     182                 :     {
     183                 :         int         chlen,
     184                 :                     chwidth;
     185                 : 
     186           14426 :         chlen = PQmblen(pwcs, encoding);
     187           14426 :         if (len < (size_t) chlen)
     188 UBC           0 :             break;              /* Invalid string */
     189                 : 
     190 CBC       14426 :         chwidth = PQdsplen(pwcs, encoding);
     191           14426 :         if (chwidth > 0)
     192           14426 :             width += chwidth;
     193                 : 
     194           14426 :         pwcs += chlen;
     195           14426 :         len -= chlen;
     196                 :     }
     197            1607 :     return width;
     198                 : }
     199                 : 
     200                 : /*
     201                 :  * pg_wcssize takes the given string in the given encoding and returns three
     202                 :  * values:
     203                 :  *    result_width: Width in display characters of the longest line in string
     204                 :  *    result_height: Number of lines in display output
     205                 :  *    result_format_size: Number of bytes required to store formatted
     206                 :  *      representation of string
     207                 :  *
     208                 :  * This MUST be kept in sync with pg_wcsformat!
     209                 :  */
     210                 : void
     211         1084666 : pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
     212                 :            int *result_width, int *result_height, int *result_format_size)
     213                 : {
     214                 :     int         w,
     215         1084666 :                 chlen = 0,
     216         1084666 :                 linewidth = 0;
     217         1084666 :     int         width = 0;
     218         1084666 :     int         height = 1;
     219         1084666 :     int         format_size = 0;
     220                 : 
     221        13050549 :     for (; *pwcs && len > 0; pwcs += chlen)
     222                 :     {
     223        11965883 :         chlen = PQmblen((const char *) pwcs, encoding);
     224        11965883 :         if (len < (size_t) chlen)
     225 UBC           0 :             break;
     226 CBC    11965883 :         w = PQdsplen((const char *) pwcs, encoding);
     227                 : 
     228        11965883 :         if (chlen == 1)         /* single-byte char */
     229                 :         {
     230        11964523 :             if (*pwcs == '\n')  /* Newline */
     231                 :             {
     232           17929 :                 if (linewidth > width)
     233            4324 :                     width = linewidth;
     234           17929 :                 linewidth = 0;
     235           17929 :                 height += 1;
     236           17929 :                 format_size += 1;   /* For NUL char */
     237                 :             }
     238        11946594 :             else if (*pwcs == '\r') /* Linefeed */
     239                 :             {
     240               8 :                 linewidth += 2;
     241               8 :                 format_size += 2;
     242                 :             }
     243        11946586 :             else if (*pwcs == '\t') /* Tab */
     244                 :             {
     245                 :                 do
     246                 :                 {
     247            1470 :                     linewidth++;
     248            1470 :                     format_size++;
     249            1470 :                 } while (linewidth % 8 != 0);
     250                 :             }
     251        11946398 :             else if (w < 0)      /* Other control char */
     252                 :             {
     253              72 :                 linewidth += 4;
     254              72 :                 format_size += 4;
     255                 :             }
     256                 :             else                /* Output it as-is */
     257                 :             {
     258        11946326 :                 linewidth += w;
     259        11946326 :                 format_size += 1;
     260                 :             }
     261                 :         }
     262            1360 :         else if (w < 0)          /* Non-ascii control char */
     263                 :         {
     264 UBC           0 :             linewidth += 6;     /* \u0000 */
     265               0 :             format_size += 6;
     266                 :         }
     267                 :         else                    /* All other chars */
     268                 :         {
     269 CBC        1360 :             linewidth += w;
     270            1360 :             format_size += chlen;
     271                 :         }
     272        11965883 :         len -= chlen;
     273                 :     }
     274         1084666 :     if (linewidth > width)
     275         1008008 :         width = linewidth;
     276         1084666 :     format_size += 1;           /* For NUL char */
     277                 : 
     278                 :     /* Set results */
     279         1084666 :     if (result_width)
     280         1084666 :         *result_width = width;
     281         1084666 :     if (result_height)
     282         1084666 :         *result_height = height;
     283         1084666 :     if (result_format_size)
     284         1082243 :         *result_format_size = format_size;
     285         1084666 : }
     286                 : 
     287                 : /*
     288                 :  *  Format a string into one or more "struct lineptr" lines.
     289                 :  *  lines[i].ptr == NULL indicates the end of the array.
     290                 :  *
     291                 :  * This MUST be kept in sync with pg_wcssize!
     292                 :  */
     293                 : void
     294          597013 : pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
     295                 :              struct lineptr *lines, int count)
     296                 : {
     297                 :     int         w,
     298          597013 :                 chlen = 0;
     299          597013 :     int         linewidth = 0;
     300          597013 :     unsigned char *ptr = lines->ptr; /* Pointer to data area */
     301                 : 
     302         6954694 :     for (; *pwcs && len > 0; pwcs += chlen)
     303                 :     {
     304         6357681 :         chlen = PQmblen((const char *) pwcs, encoding);
     305         6357681 :         if (len < (size_t) chlen)
     306 UBC           0 :             break;
     307 CBC     6357681 :         w = PQdsplen((const char *) pwcs, encoding);
     308                 : 
     309         6357681 :         if (chlen == 1)         /* single-byte char */
     310                 :         {
     311         6357001 :             if (*pwcs == '\n')  /* Newline */
     312                 :             {
     313            9581 :                 *ptr++ = '\0';
     314            9581 :                 lines->width = linewidth;
     315            9581 :                 linewidth = 0;
     316            9581 :                 lines++;
     317            9581 :                 count--;
     318            9581 :                 if (count <= 0)
     319 UBC           0 :                     exit(1);    /* Screwup */
     320                 : 
     321                 :                 /* make next line point to remaining memory */
     322 CBC        9581 :                 lines->ptr = ptr;
     323                 :             }
     324         6347420 :             else if (*pwcs == '\r') /* Linefeed */
     325                 :             {
     326               4 :                 strcpy((char *) ptr, "\\r");
     327               4 :                 linewidth += 2;
     328               4 :                 ptr += 2;
     329                 :             }
     330         6347416 :             else if (*pwcs == '\t') /* Tab */
     331                 :             {
     332                 :                 do
     333                 :                 {
     334             735 :                     *ptr++ = ' ';
     335             735 :                     linewidth++;
     336             735 :                 } while (linewidth % 8 != 0);
     337                 :             }
     338         6347322 :             else if (w < 0)      /* Other control char */
     339                 :             {
     340              36 :                 sprintf((char *) ptr, "\\x%02X", *pwcs);
     341              36 :                 linewidth += 4;
     342              36 :                 ptr += 4;
     343                 :             }
     344                 :             else                /* Output it as-is */
     345                 :             {
     346         6347286 :                 linewidth += w;
     347         6347286 :                 *ptr++ = *pwcs;
     348                 :             }
     349                 :         }
     350             680 :         else if (w < 0)          /* Non-ascii control char */
     351                 :         {
     352 UBC           0 :             if (encoding == PG_UTF8)
     353               0 :                 sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
     354                 :             else
     355                 :             {
     356                 :                 /*
     357                 :                  * This case cannot happen in the current code because only
     358                 :                  * UTF-8 signals multibyte control characters. But we may need
     359                 :                  * to support it at some stage
     360                 :                  */
     361               0 :                 sprintf((char *) ptr, "\\u????");
     362                 :             }
     363               0 :             ptr += 6;
     364               0 :             linewidth += 6;
     365                 :         }
     366                 :         else                    /* All other chars */
     367                 :         {
     368                 :             int         i;
     369                 : 
     370 CBC        2091 :             for (i = 0; i < chlen; i++)
     371            1411 :                 *ptr++ = pwcs[i];
     372             680 :             linewidth += w;
     373                 :         }
     374         6357681 :         len -= chlen;
     375                 :     }
     376          597013 :     lines->width = linewidth;
     377          597013 :     *ptr++ = '\0';              /* Terminate formatted string */
     378                 : 
     379          597013 :     if (count <= 0)
     380 UBC           0 :         exit(1);                /* Screwup */
     381                 : 
     382 CBC      597013 :     (lines + 1)->ptr = NULL; /* terminate line array */
     383          597013 : }
     384                 : 
     385                 : 
     386                 : /*
     387                 :  * Encoding validation: delete any unvalidatable characters from the string
     388                 :  *
     389                 :  * This seems redundant with existing functionality elsewhere?
     390                 :  */
     391                 : unsigned char *
     392         4605072 : mbvalidate(unsigned char *pwcs, int encoding)
     393                 : {
     394         4605072 :     if (encoding == PG_UTF8)
     395         4603897 :         mb_utf_validate(pwcs);
     396                 :     else
     397                 :     {
     398                 :         /*
     399                 :          * other encodings needing validation should add their own routines
     400                 :          * here
     401                 :          */
     402                 :     }
     403                 : 
     404         4605072 :     return pwcs;
     405                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a