LCOV - differential code coverage report
Current view: top level - src/backend/tsearch - ts_locale.c (source / functions) Coverage Total Hit UNC LBC UIC UBC GIC GNC CBC EUB ECB
Current: Differential Code Coverage HEAD vs 15 Lines: 79.2 % 101 80 2 1 10 8 42 5 33 13 46
Current Date: 2023-04-08 15:15:32 Functions: 90.9 % 11 10 1 6 1 3 1 7
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * ts_locale.c
       4                 :  *      locale compatibility layer for tsearch
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  *
       8                 :  *
       9                 :  * IDENTIFICATION
      10                 :  *    src/backend/tsearch/ts_locale.c
      11                 :  *
      12                 :  *-------------------------------------------------------------------------
      13                 :  */
      14                 : #include "postgres.h"
      15                 : 
      16                 : #include "catalog/pg_collation.h"
      17                 : #include "common/string.h"
      18                 : #include "storage/fd.h"
      19                 : #include "tsearch/ts_locale.h"
      20                 : #include "tsearch/ts_public.h"
      21                 : 
      22                 : static void tsearch_readline_callback(void *arg);
      23                 : 
      24                 : 
      25                 : /*
      26                 :  * The reason these functions use a 3-wchar_t output buffer, not 2 as you
      27                 :  * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
      28                 :  * getting from char2wchar() is UTF16 not UTF32.  A single input character
      29                 :  * may therefore produce a surrogate pair rather than just one wchar_t;
      30                 :  * we also need room for a trailing null.  When we do get a surrogate pair,
      31                 :  * we pass just the first code to iswdigit() etc, so that these functions will
      32                 :  * always return false for characters outside the Basic Multilingual Plane.
      33                 :  */
      34                 : #define WC_BUF_LEN  3
      35                 : 
      36                 : int
      37 CBC       10302 : t_isdigit(const char *ptr)
      38                 : {
      39           10302 :     int         clen = pg_mblen(ptr);
      40                 :     wchar_t     character[WC_BUF_LEN];
      41           10302 :     pg_locale_t mylocale = 0;   /* TODO */
      42                 : 
      43           10302 :     if (clen == 1 || database_ctype_is_c)
      44           10302 :         return isdigit(TOUCHAR(ptr));
      45                 : 
      46 UBC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      47                 : 
      48               0 :     return iswdigit((wint_t) character[0]);
      49                 : }
      50                 : 
      51                 : int
      52 CBC      454400 : t_isspace(const char *ptr)
      53                 : {
      54          454400 :     int         clen = pg_mblen(ptr);
      55                 :     wchar_t     character[WC_BUF_LEN];
      56          454400 :     pg_locale_t mylocale = 0;   /* TODO */
      57                 : 
      58          454400 :     if (clen == 1 || database_ctype_is_c)
      59          454400 :         return isspace(TOUCHAR(ptr));
      60                 : 
      61 UBC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      62                 : 
      63               0 :     return iswspace((wint_t) character[0]);
      64                 : }
      65                 : 
      66                 : int
      67 CBC        5142 : t_isalpha(const char *ptr)
      68                 : {
      69            5142 :     int         clen = pg_mblen(ptr);
      70                 :     wchar_t     character[WC_BUF_LEN];
      71            5142 :     pg_locale_t mylocale = 0;   /* TODO */
      72                 : 
      73            5142 :     if (clen == 1 || database_ctype_is_c)
      74            5142 :         return isalpha(TOUCHAR(ptr));
      75                 : 
      76 UBC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      77                 : 
      78               0 :     return iswalpha((wint_t) character[0]);
      79                 : }
      80                 : 
      81                 : int
      82 GNC     1371889 : t_isalnum(const char *ptr)
      83                 : {
      84         1371889 :     int         clen = pg_mblen(ptr);
      85                 :     wchar_t     character[WC_BUF_LEN];
      86         1371889 :     pg_locale_t mylocale = 0;   /* TODO */
      87                 : 
      88         1371889 :     if (clen == 1 || database_ctype_is_c)
      89         1371889 :         return isalnum(TOUCHAR(ptr));
      90                 : 
      91 UNC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
      92                 : 
      93               0 :     return iswalnum((wint_t) character[0]);
      94                 : }
      95                 : 
      96                 : int
      97 CBC        2075 : t_isprint(const char *ptr)
      98                 : {
      99            2075 :     int         clen = pg_mblen(ptr);
     100                 :     wchar_t     character[WC_BUF_LEN];
     101            2075 :     pg_locale_t mylocale = 0;   /* TODO */
     102                 : 
     103            2075 :     if (clen == 1 || database_ctype_is_c)
     104            2075 :         return isprint(TOUCHAR(ptr));
     105                 : 
     106 UBC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
     107                 : 
     108               0 :     return iswprint((wint_t) character[0]);
     109                 : }
     110                 : 
     111                 : 
     112 ECB             : /*
     113                 :  * Set up to read a file using tsearch_readline().  This facility is
     114                 :  * better than just reading the file directly because it provides error
     115                 :  * context pointing to the specific line where a problem is detected.
     116                 :  *
     117                 :  * Expected usage is:
     118                 :  *
     119                 :  *      tsearch_readline_state trst;
     120                 :  *
     121 EUB             :  *      if (!tsearch_readline_begin(&trst, filename))
     122                 :  *          ereport(ERROR,
     123                 :  *                  (errcode(ERRCODE_CONFIG_FILE_ERROR),
     124                 :  *                   errmsg("could not open stop-word file \"%s\": %m",
     125                 :  *                          filename)));
     126                 :  *      while ((line = tsearch_readline(&trst)) != NULL)
     127                 :  *          process line;
     128                 :  *      tsearch_readline_end(&trst);
     129                 :  *
     130                 :  * Note that the caller supplies the ereport() for file open failure;
     131                 :  * this is so that a custom message can be provided.  The filename string
     132                 :  * passed to tsearch_readline_begin() must remain valid through
     133                 :  * tsearch_readline_end().
     134                 :  */
     135                 : bool
     136 GIC         278 : tsearch_readline_begin(tsearch_readline_state *stp,
     137                 :                        const char *filename)
     138                 : {
     139             278 :     if ((stp->fp = AllocateFile(filename, "r")) == NULL)
     140 UIC           0 :         return false;
     141 GIC         278 :     stp->filename = filename;
     142             278 :     stp->lineno = 0;
     143             278 :     initStringInfo(&stp->buf);
     144             278 :     stp->curline = NULL;
     145                 :     /* Setup error traceback support for ereport() */
     146             278 :     stp->cb.callback = tsearch_readline_callback;
     147             278 :     stp->cb.arg = (void *) stp;
     148             278 :     stp->cb.previous = error_context_stack;
     149             278 :     error_context_stack = &stp->cb;
     150             278 :     return true;
     151 ECB             : }
     152                 : 
     153                 : /*
     154                 :  * Read the next line from a tsearch data file (expected to be in UTF-8), and
     155 EUB             :  * convert it to database encoding if needed. The returned string is palloc'd.
     156 ECB             :  * NULL return means EOF.
     157                 :  */
     158                 : char *
     159 CBC       10711 : tsearch_readline(tsearch_readline_state *stp)
     160                 : {
     161 ECB             :     char       *recoded;
     162                 : 
     163                 :     /* Advance line number to use in error reports */
     164 CBC       10711 :     stp->lineno++;
     165 ECB             : 
     166                 :     /* Clear curline, it's no longer relevant */
     167 GIC       10711 :     if (stp->curline)
     168                 :     {
     169           10433 :         if (stp->curline != stp->buf.data)
     170 UIC           0 :             pfree(stp->curline);
     171 GIC       10433 :         stp->curline = NULL;
     172                 :     }
     173                 : 
     174 ECB             :     /* Collect next line, if there is one */
     175 GIC       10711 :     if (!pg_get_line_buf(stp->fp, &stp->buf))
     176             235 :         return NULL;
     177                 : 
     178                 :     /* Validate the input as UTF-8, then convert to DB encoding if needed */
     179 CBC       10476 :     recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
     180                 : 
     181                 :     /* Save the correctly-encoded string for possible error reports */
     182           10476 :     stp->curline = recoded;      /* might be equal to buf.data */
     183                 : 
     184 ECB             :     /*
     185 EUB             :      * We always return a freshly pstrdup'd string.  This is clearly necessary
     186 ECB             :      * if pg_any_to_server() returned buf.data, and we need a second copy even
     187                 :      * if encoding conversion did occur.  The caller is entitled to pfree the
     188                 :      * returned string at any time, which would leave curline pointing to
     189                 :      * recycled storage, causing problems if an error occurs after that point.
     190                 :      * (It's preferable to return the result of pstrdup instead of the output
     191                 :      * of pg_any_to_server, because the conversion result tends to be
     192                 :      * over-allocated.  Since callers might save the result string directly
     193                 :      * into a long-lived dictionary structure, we don't want it to be a larger
     194                 :      * palloc chunk than necessary.  We'll reclaim the conversion result on
     195                 :      * the next call.)
     196                 :      */
     197 CBC       10476 :     return pstrdup(recoded);
     198                 : }
     199                 : 
     200                 : /*
     201                 :  * Close down after reading a file with tsearch_readline()
     202                 :  */
     203                 : void
     204 GIC         278 : tsearch_readline_end(tsearch_readline_state *stp)
     205                 : {
     206                 :     /* Suppress use of curline in any error reported below */
     207             278 :     if (stp->curline)
     208                 :     {
     209              43 :         if (stp->curline != stp->buf.data)
     210 UIC           0 :             pfree(stp->curline);
     211 GIC          43 :         stp->curline = NULL;
     212 ECB             :     }
     213                 : 
     214                 :     /* Release other resources */
     215 GIC         278 :     pfree(stp->buf.data);
     216             278 :     FreeFile(stp->fp);
     217                 : 
     218                 :     /* Pop the error context stack */
     219 CBC         278 :     error_context_stack = stp->cb.previous;
     220 GIC         278 : }
     221                 : 
     222 ECB             : /*
     223                 :  * Error context callback for errors occurring while reading a tsearch
     224                 :  * configuration file.
     225 EUB             :  */
     226 ECB             : static void
     227 UIC           0 : tsearch_readline_callback(void *arg)
     228                 : {
     229               0 :     tsearch_readline_state *stp = (tsearch_readline_state *) arg;
     230 ECB             : 
     231                 :     /*
     232                 :      * We can't include the text of the config line for errors that occur
     233                 :      * during tsearch_readline() itself.  The major cause of such errors is
     234                 :      * encoding violations, and we daren't try to print error messages
     235                 :      * containing badly-encoded data.
     236                 :      */
     237 UIC           0 :     if (stp->curline)
     238               0 :         errcontext("line %d of configuration file \"%s\": \"%s\"",
     239                 :                    stp->lineno,
     240                 :                    stp->filename,
     241                 :                    stp->curline);
     242 EUB             :     else
     243 UIC           0 :         errcontext("line %d of configuration file \"%s\"",
     244 EUB             :                    stp->lineno,
     245                 :                    stp->filename);
     246 UIC           0 : }
     247                 : 
     248                 : 
     249                 : /*
     250                 :  * lowerstr --- fold null-terminated string to lower case
     251                 :  *
     252 EUB             :  * Returned string is palloc'd
     253                 :  */
     254                 : char *
     255 GIC        6922 : lowerstr(const char *str)
     256                 : {
     257            6922 :     return lowerstr_with_len(str, strlen(str));
     258 EUB             : }
     259                 : 
     260                 : /*
     261                 :  * lowerstr_with_len --- fold string to lower case
     262                 :  *
     263                 :  * Input string need not be null-terminated.
     264                 :  *
     265                 :  * Returned string is palloc'd
     266                 :  */
     267                 : char *
     268 GIC      140846 : lowerstr_with_len(const char *str, int len)
     269                 : {
     270 ECB             :     char       *out;
     271 GIC      140846 :     pg_locale_t mylocale = 0;   /* TODO */
     272 ECB             : 
     273 GIC      140846 :     if (len == 0)
     274 UIC           0 :         return pstrdup("");
     275                 : 
     276                 :     /*
     277                 :      * Use wide char code only when max encoding length > 1 and ctype != C.
     278                 :      * Some operating systems fail with multi-byte encodings and a C locale.
     279                 :      * Also, for a C locale there is no need to process as multibyte. From
     280                 :      * backend/utils/adt/oracle_compat.c Teodor
     281                 :      */
     282 GIC      140846 :     if (pg_database_encoding_max_length() > 1 && !database_ctype_is_c)
     283 CBC      135993 :     {
     284                 :         wchar_t    *wstr,
     285                 :                    *wptr;
     286 ECB             :         int         wlen;
     287                 : 
     288                 :         /*
     289 EUB             :          * alloc number of wchar_t for worst case, len contains number of
     290                 :          * bytes >= number of characters and alloc 1 wchar_t for 0, because
     291                 :          * wchar2char wants zero-terminated string
     292                 :          */
     293 GIC      135993 :         wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
     294                 : 
     295          135993 :         wlen = char2wchar(wstr, len + 1, str, len, mylocale);
     296          135993 :         Assert(wlen <= len);
     297 ECB             : 
     298 CBC     1140745 :         while (*wptr)
     299                 :         {
     300 GIC     1004752 :             *wptr = towlower((wint_t) *wptr);
     301         1004752 :             wptr++;
     302                 :         }
     303                 : 
     304                 :         /*
     305                 :          * Alloc result string for worst case + '\0'
     306                 :          */
     307          135993 :         len = pg_database_encoding_max_length() * wlen + 1;
     308 CBC      135993 :         out = (char *) palloc(len);
     309                 : 
     310          135993 :         wlen = wchar2char(out, wstr, len, mylocale);
     311 ECB             : 
     312 GIC      135993 :         pfree(wstr);
     313 ECB             : 
     314 GIC      135993 :         if (wlen < 0)
     315 LBC           0 :             ereport(ERROR,
     316 ECB             :                     (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     317                 :                      errmsg("conversion from wchar_t to server encoding failed: %m")));
     318 GIC      135993 :         Assert(wlen < len);
     319                 :     }
     320                 :     else
     321                 :     {
     322 CBC        4853 :         const char *ptr = str;
     323 ECB             :         char       *outptr;
     324                 : 
     325 CBC        4853 :         outptr = out = (char *) palloc(sizeof(char) * (len + 1));
     326 GIC       28192 :         while ((ptr - str) < len && *ptr)
     327 ECB             :         {
     328 GIC       23339 :             *outptr++ = tolower(TOUCHAR(ptr));
     329 CBC       23339 :             ptr++;
     330 EUB             :         }
     331 GIC        4853 :         *outptr = '\0';
     332                 :     }
     333 ECB             : 
     334 GIC      140846 :     return out;
     335                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a