|  Age         Owner                  TLA  Line data    Source code 
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * ts_locale.c
                                  4                 :  *      locale compatibility layer for tsearch
                                  5                 :  *
                                  6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                  7                 :  *
                                  8                 :  *
                                  9                 :  * IDENTIFICATION
                                 10                 :  *    src/backend/tsearch/ts_locale.c
                                 11                 :  *
                                 12                 :  *-------------------------------------------------------------------------
                                 13                 :  */
                                 14                 : #include "postgres.h"
                                 15                 : 
                                 16                 : #include "catalog/pg_collation.h"
                                 17                 : #include "common/string.h"
                                 18                 : #include "storage/fd.h"
                                 19                 : #include "tsearch/ts_locale.h"
                                 20                 : #include "tsearch/ts_public.h"
                                 21                 : 
/* Error context callback; installed by tsearch_readline_begin() below */
static void tsearch_readline_callback(void *arg);


/*
 * The reason these functions use a 3-wchar_t output buffer, not 2 as you
 * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
 * getting from char2wchar() is UTF16 not UTF32.  A single input character
 * may therefore produce a surrogate pair rather than just one wchar_t;
 * we also need room for a trailing null.  When we do get a surrogate pair,
 * we pass just the first code to iswdigit() etc, so that these functions will
 * always return false for characters outside the Basic Multilingual Plane.
 *
 * WC_BUF_LEN is both the size of the local wchar_t buffers in the t_isXXX()
 * functions and the output-length limit they pass to char2wchar().
 */
#define WC_BUF_LEN  3
                                 35                 : 
                                 36                 : int
 5630 tgl                        37 CBC       10302 : t_isdigit(const char *ptr)
                                 38                 : {
                                 39           10302 :     int         clen = pg_mblen(ptr);
                                 40                 :     wchar_t     character[WC_BUF_LEN];
 4322 bruce                      41           10302 :     pg_locale_t mylocale = 0;   /* TODO */
                                 42                 : 
   23 jdavis                     43           10302 :     if (clen == 1 || database_ctype_is_c)
 5630 tgl                        44           10302 :         return isdigit(TOUCHAR(ptr));
                                 45                 : 
 1618 tgl                        46 UBC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
                                 47                 : 
 5630                            48               0 :     return iswdigit((wint_t) character[0]);
                                 49                 : }
                                 50                 : 
                                 51                 : int
 5630 tgl                        52 CBC      454400 : t_isspace(const char *ptr)
                                 53                 : {
                                 54          454400 :     int         clen = pg_mblen(ptr);
                                 55                 :     wchar_t     character[WC_BUF_LEN];
 4322 bruce                      56          454400 :     pg_locale_t mylocale = 0;   /* TODO */
                                 57                 : 
   23 jdavis                     58          454400 :     if (clen == 1 || database_ctype_is_c)
 5630 tgl                        59          454400 :         return isspace(TOUCHAR(ptr));
                                 60                 : 
 1618 tgl                        61 UBC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
                                 62                 : 
 5630                            63               0 :     return iswspace((wint_t) character[0]);
                                 64                 : }
                                 65                 : 
                                 66                 : int
 5630 tgl                        67 CBC        5142 : t_isalpha(const char *ptr)
                                 68                 : {
                                 69            5142 :     int         clen = pg_mblen(ptr);
                                 70                 :     wchar_t     character[WC_BUF_LEN];
 4322 bruce                      71            5142 :     pg_locale_t mylocale = 0;   /* TODO */
                                 72                 : 
   23 jdavis                     73            5142 :     if (clen == 1 || database_ctype_is_c)
 5710 tgl                        74            5142 :         return isalpha(TOUCHAR(ptr));
                                 75                 : 
 1618 tgl                        76 UBC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
                                 77                 : 
 5630                            78               0 :     return iswalpha((wint_t) character[0]);
                                 79                 : }
                                 80                 : 
                                 81                 : int
  185 tgl                        82 GNC     1371889 : t_isalnum(const char *ptr)
                                 83                 : {
                                 84         1371889 :     int         clen = pg_mblen(ptr);
                                 85                 :     wchar_t     character[WC_BUF_LEN];
                                 86         1371889 :     pg_locale_t mylocale = 0;   /* TODO */
                                 87                 : 
   23 jdavis                     88         1371889 :     if (clen == 1 || database_ctype_is_c)
  185 tgl                        89         1371889 :         return isalnum(TOUCHAR(ptr));
                                 90                 : 
  185 tgl                        91 UNC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
                                 92                 : 
                                 93               0 :     return iswalnum((wint_t) character[0]);
                                 94                 : }
                                 95                 : 
                                 96                 : int
 5630 tgl                        97 CBC        2075 : t_isprint(const char *ptr)
                                 98                 : {
                                 99            2075 :     int         clen = pg_mblen(ptr);
                                100                 :     wchar_t     character[WC_BUF_LEN];
 4322 bruce                     101            2075 :     pg_locale_t mylocale = 0;   /* TODO */
                                102                 : 
   23 jdavis                    103            2075 :     if (clen == 1 || database_ctype_is_c)
 5710 tgl                       104            2075 :         return isprint(TOUCHAR(ptr));
                                105                 : 
 1618 tgl                       106 UBC           0 :     char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
                                107                 : 
 5630                           108               0 :     return iswprint((wint_t) character[0]);
                                109                 : }
                                110                 : 
                                111                 : 
 5408 tgl                       112 ECB             : /*
                                113                 :  * Set up to read a file using tsearch_readline().  This facility is
                                114                 :  * better than just reading the file directly because it provides error
                                115                 :  * context pointing to the specific line where a problem is detected.
                                116                 :  *
                                117                 :  * Expected usage is:
                                118                 :  *
                                119                 :  *      tsearch_readline_state trst;
                                120                 :  *
 5408 tgl                       121 EUB             :  *      if (!tsearch_readline_begin(&trst, filename))
                                122                 :  *          ereport(ERROR,
                                123                 :  *                  (errcode(ERRCODE_CONFIG_FILE_ERROR),
                                124                 :  *                   errmsg("could not open stop-word file \"%s\": %m",
                                125                 :  *                          filename)));
                                126                 :  *      while ((line = tsearch_readline(&trst)) != NULL)
                                127                 :  *          process line;
                                128                 :  *      tsearch_readline_end(&trst);
                                129                 :  *
                                130                 :  * Note that the caller supplies the ereport() for file open failure;
                                131                 :  * this is so that a custom message can be provided.  The filename string
                                132                 :  * passed to tsearch_readline_begin() must remain valid through
                                133                 :  * tsearch_readline_end().
                                134                 :  */
                                135                 : bool
 5408 tgl                       136 GIC         278 : tsearch_readline_begin(tsearch_readline_state *stp,
                                137                 :                        const char *filename)
                                138                 : {
                                139             278 :     if ((stp->fp = AllocateFile(filename, "r")) == NULL)
 5408 tgl                       140 UIC           0 :         return false;
 5408 tgl                       141 GIC         278 :     stp->filename = filename;
                                142             278 :     stp->lineno = 0;
  928                           143             278 :     initStringInfo(&stp->buf);
 5408                           144             278 :     stp->curline = NULL;
                                145                 :     /* Setup error traceback support for ereport() */
                                146             278 :     stp->cb.callback = tsearch_readline_callback;
                                147             278 :     stp->cb.arg = (void *) stp;
                                148             278 :     stp->cb.previous = error_context_stack;
                                149             278 :     error_context_stack = &stp->cb;
                                150             278 :     return true;
 5408 tgl                       151 ECB             : }
                                152                 : 
                                153                 : /*
 5706                           154                 :  * Read the next line from a tsearch data file (expected to be in UTF-8), and
 5706 tgl                       155 EUB             :  * convert it to database encoding if needed. The returned string is palloc'd.
 5706 tgl                       156 ECB             :  * NULL return means EOF.
 5710                           157                 :  */
                                158                 : char *
 5408 tgl                       159 CBC       10711 : tsearch_readline(tsearch_readline_state *stp)
                                160                 : {
  928 tgl                       161 ECB             :     char       *recoded;
 5408                           162                 : 
  928                           163                 :     /* Advance line number to use in error reports */
 5408 tgl                       164 CBC       10711 :     stp->lineno++;
  928 tgl                       165 ECB             : 
                                166                 :     /* Clear curline, it's no longer relevant */
  928 tgl                       167 GIC       10711 :     if (stp->curline)
                                168                 :     {
                                169           10433 :         if (stp->curline != stp->buf.data)
  928 tgl                       170 UIC           0 :             pfree(stp->curline);
  928 tgl                       171 GIC       10433 :         stp->curline = NULL;
                                172                 :     }
                                173                 : 
  928 tgl                       174 ECB             :     /* Collect next line, if there is one */
  928 tgl                       175 GIC       10711 :     if (!pg_get_line_buf(stp->fp, &stp->buf))
                                176             235 :         return NULL;
                                177                 : 
                                178                 :     /* Validate the input as UTF-8, then convert to DB encoding if needed */
  928 tgl                       179 CBC       10476 :     recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
                                180                 : 
                                181                 :     /* Save the correctly-encoded string for possible error reports */
                                182           10476 :     stp->curline = recoded;      /* might be equal to buf.data */
                                183                 : 
  928 tgl                       184 ECB             :     /*
  928 tgl                       185 EUB             :      * We always return a freshly pstrdup'd string.  This is clearly necessary
  928 tgl                       186 ECB             :      * if pg_any_to_server() returned buf.data, and we need a second copy even
                                187                 :      * if encoding conversion did occur.  The caller is entitled to pfree the
                                188                 :      * returned string at any time, which would leave curline pointing to
                                189                 :      * recycled storage, causing problems if an error occurs after that point.
                                190                 :      * (It's preferable to return the result of pstrdup instead of the output
                                191                 :      * of pg_any_to_server, because the conversion result tends to be
                                192                 :      * over-allocated.  Since callers might save the result string directly
                                193                 :      * into a long-lived dictionary structure, we don't want it to be a larger
                                194                 :      * palloc chunk than necessary.  We'll reclaim the conversion result on
                                195                 :      * the next call.)
                                196                 :      */
  928 tgl                       197 CBC       10476 :     return pstrdup(recoded);
                                198                 : }
                                199                 : 
                                200                 : /*
                                201                 :  * Close down after reading a file with tsearch_readline()
                                202                 :  */
                                203                 : void
 5408 tgl                       204 GIC         278 : tsearch_readline_end(tsearch_readline_state *stp)
                                205                 : {
                                206                 :     /* Suppress use of curline in any error reported below */
  928                           207             278 :     if (stp->curline)
                                208                 :     {
                                209              43 :         if (stp->curline != stp->buf.data)
  928 tgl                       210 UIC           0 :             pfree(stp->curline);
  928 tgl                       211 GIC          43 :         stp->curline = NULL;
  928 tgl                       212 ECB             :     }
                                213                 : 
                                214                 :     /* Release other resources */
  928 tgl                       215 GIC         278 :     pfree(stp->buf.data);
 5408                           216             278 :     FreeFile(stp->fp);
                                217                 : 
                                218                 :     /* Pop the error context stack */
 5408 tgl                       219 CBC         278 :     error_context_stack = stp->cb.previous;
 5408 tgl                       220 GIC         278 : }
                                221                 : 
 5408 tgl                       222 ECB             : /*
                                223                 :  * Error context callback for errors occurring while reading a tsearch
                                224                 :  * configuration file.
 5408 tgl                       225 EUB             :  */
 5408 tgl                       226 ECB             : static void
 5408 tgl                       227 UIC           0 : tsearch_readline_callback(void *arg)
                                228                 : {
                                229               0 :     tsearch_readline_state *stp = (tsearch_readline_state *) arg;
 5408 tgl                       230 ECB             : 
                                231                 :     /*
                                232                 :      * We can't include the text of the config line for errors that occur
                                233                 :      * during tsearch_readline() itself.  The major cause of such errors is
 5050 bruce                     234                 :      * encoding violations, and we daren't try to print error messages
                                235                 :      * containing badly-encoded data.
                                236                 :      */
 5408 tgl                       237 UIC           0 :     if (stp->curline)
                                238               0 :         errcontext("line %d of configuration file \"%s\": \"%s\"",
                                239                 :                    stp->lineno,
                                240                 :                    stp->filename,
                                241                 :                    stp->curline);
 5408 tgl                       242 EUB             :     else
 5408 tgl                       243 UIC           0 :         errcontext("line %d of configuration file \"%s\"",
 5408 tgl                       244 EUB             :                    stp->lineno,
                                245                 :                    stp->filename);
 5408 tgl                       246 UIC           0 : }
                                247                 : 
                                248                 : 
                                249                 : /*
                                250                 :  * lowerstr --- fold null-terminated string to lower case
                                251                 :  *
 5630 tgl                       252 EUB             :  * Returned string is palloc'd
                                253                 :  */
                                254                 : char *
 5630 tgl                       255 GIC        6922 : lowerstr(const char *str)
                                256                 : {
 5710                           257            6922 :     return lowerstr_with_len(str, strlen(str));
 5710 tgl                       258 EUB             : }
                                259                 : 
                                260                 : /*
 5630                           261                 :  * lowerstr_with_len --- fold string to lower case
                                262                 :  *
                                263                 :  * Input string need not be null-terminated.
                                264                 :  *
                                265                 :  * Returned string is palloc'd
                                266                 :  */
                                267                 : char *
 5630 tgl                       268 GIC      140846 : lowerstr_with_len(const char *str, int len)
                                269                 : {
 5710 tgl                       270 ECB             :     char       *out;
 4322 bruce                     271 GIC      140846 :     pg_locale_t mylocale = 0;   /* TODO */
 5710 tgl                       272 ECB             : 
 5710 tgl                       273 GIC      140846 :     if (len == 0)
 5710 tgl                       274 UIC           0 :         return pstrdup("");
                                275                 : 
                                276                 :     /*
                                277                 :      * Use wide char code only when max encoding length > 1 and ctype != C.
                                278                 :      * Some operating systems fail with multi-byte encodings and a C locale.
                                279                 :      * Also, for a C locale there is no need to process as multibyte. From
                                280                 :      * backend/utils/adt/oracle_compat.c Teodor
                                281                 :      */
   23 jdavis                    282 GIC      140846 :     if (pg_database_encoding_max_length() > 1 && !database_ctype_is_c)
 5710 tgl                       283 CBC      135993 :     {
                                284                 :         wchar_t    *wstr,
                                285                 :                    *wptr;
 5710 tgl                       286 ECB             :         int         wlen;
                                287                 : 
                                288                 :         /*
 5710 tgl                       289 EUB             :          * alloc number of wchar_t for worst case, len contains number of
                                290                 :          * bytes >= number of characters and alloc 1 wchar_t for 0, because
                                291                 :          * wchar2char wants zero-terminated string
                                292                 :          */
 5710 tgl                       293 GIC      135993 :         wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
                                294                 : 
 4369                           295          135993 :         wlen = char2wchar(wstr, len + 1, str, len, mylocale);
 5710                           296          135993 :         Assert(wlen <= len);
 5710 tgl                       297 ECB             : 
 5710 tgl                       298 CBC     1140745 :         while (*wptr)
                                299                 :         {
 5710 tgl                       300 GIC     1004752 :             *wptr = towlower((wint_t) *wptr);
                                301         1004752 :             wptr++;
                                302                 :         }
                                303                 : 
                                304                 :         /*
                                305                 :          * Alloc result string for worst case + '\0'
                                306                 :          */
 5630                           307          135993 :         len = pg_database_encoding_max_length() * wlen + 1;
 5710 tgl                       308 CBC      135993 :         out = (char *) palloc(len);
                                309                 : 
 4369                           310          135993 :         wlen = wchar2char(out, wstr, len, mylocale);
 5630 tgl                       311 ECB             : 
 5710 tgl                       312 GIC      135993 :         pfree(wstr);
 5710 tgl                       313 ECB             : 
 5710 tgl                       314 GIC      135993 :         if (wlen < 0)
 5710 tgl                       315 LBC           0 :             ereport(ERROR,
 5710 tgl                       316 ECB             :                     (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
                                317                 :                      errmsg("conversion from wchar_t to server encoding failed: %m")));
 5630 tgl                       318 GIC      135993 :         Assert(wlen < len);
                                319                 :     }
                                320                 :     else
                                321                 :     {
 5630 tgl                       322 CBC        4853 :         const char *ptr = str;
 5710 tgl                       323 ECB             :         char       *outptr;
                                324                 : 
 5710 tgl                       325 CBC        4853 :         outptr = out = (char *) palloc(sizeof(char) * (len + 1));
 5630 tgl                       326 GIC       28192 :         while ((ptr - str) < len && *ptr)
 5710 tgl                       327 ECB             :         {
 5630 tgl                       328 GIC       23339 :             *outptr++ = tolower(TOUCHAR(ptr));
 5710 tgl                       329 CBC       23339 :             ptr++;
 5710 tgl                       330 EUB             :         }
 5710 tgl                       331 GIC        4853 :         *outptr = '\0';
                                332                 :     }
 5710 tgl                       333 ECB             : 
 5710 tgl                       334 GIC      140846 :     return out;
                                335                 : }
         |