TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ts_locale.c
4 : * locale compatibility layer for tsearch
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/ts_locale.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "catalog/pg_collation.h"
17 : #include "common/string.h"
18 : #include "storage/fd.h"
19 : #include "tsearch/ts_locale.h"
20 : #include "tsearch/ts_public.h"
21 :
22 : static void tsearch_readline_callback(void *arg);
23 :
24 :
25 : /*
26 : * The reason these functions use a 3-wchar_t output buffer, not 2 as you
27 : * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
28 : * getting from char2wchar() is UTF16 not UTF32. A single input character
29 : * may therefore produce a surrogate pair rather than just one wchar_t;
30 : * we also need room for a trailing null. When we do get a surrogate pair,
31 : * we pass just the first code to iswdigit() etc, so that these functions will
32 : * always return false for characters outside the Basic Multilingual Plane.
33 : */
34 : #define WC_BUF_LEN 3
35 :
36 : int
37 CBC 10302 : t_isdigit(const char *ptr)
38 : {
39 10302 : int clen = pg_mblen(ptr);
40 : wchar_t character[WC_BUF_LEN];
41 10302 : pg_locale_t mylocale = 0; /* TODO */
42 :
43 10302 : if (clen == 1 || database_ctype_is_c)
44 10302 : return isdigit(TOUCHAR(ptr));
45 :
46 UBC 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
47 :
48 0 : return iswdigit((wint_t) character[0]);
49 : }
50 :
51 : int
52 CBC 454400 : t_isspace(const char *ptr)
53 : {
54 454400 : int clen = pg_mblen(ptr);
55 : wchar_t character[WC_BUF_LEN];
56 454400 : pg_locale_t mylocale = 0; /* TODO */
57 :
58 454400 : if (clen == 1 || database_ctype_is_c)
59 454400 : return isspace(TOUCHAR(ptr));
60 :
61 UBC 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
62 :
63 0 : return iswspace((wint_t) character[0]);
64 : }
65 :
66 : int
67 CBC 5142 : t_isalpha(const char *ptr)
68 : {
69 5142 : int clen = pg_mblen(ptr);
70 : wchar_t character[WC_BUF_LEN];
71 5142 : pg_locale_t mylocale = 0; /* TODO */
72 :
73 5142 : if (clen == 1 || database_ctype_is_c)
74 5142 : return isalpha(TOUCHAR(ptr));
75 :
76 UBC 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
77 :
78 0 : return iswalpha((wint_t) character[0]);
79 : }
80 :
81 : int
82 GNC 1371889 : t_isalnum(const char *ptr)
83 : {
84 1371889 : int clen = pg_mblen(ptr);
85 : wchar_t character[WC_BUF_LEN];
86 1371889 : pg_locale_t mylocale = 0; /* TODO */
87 :
88 1371889 : if (clen == 1 || database_ctype_is_c)
89 1371889 : return isalnum(TOUCHAR(ptr));
90 :
91 UNC 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
92 :
93 0 : return iswalnum((wint_t) character[0]);
94 : }
95 :
96 : int
97 CBC 2075 : t_isprint(const char *ptr)
98 : {
99 2075 : int clen = pg_mblen(ptr);
100 : wchar_t character[WC_BUF_LEN];
101 2075 : pg_locale_t mylocale = 0; /* TODO */
102 :
103 2075 : if (clen == 1 || database_ctype_is_c)
104 2075 : return isprint(TOUCHAR(ptr));
105 :
106 UBC 0 : char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
107 :
108 0 : return iswprint((wint_t) character[0]);
109 : }
110 :
111 :
112 ECB : /*
113 : * Set up to read a file using tsearch_readline(). This facility is
114 : * better than just reading the file directly because it provides error
115 : * context pointing to the specific line where a problem is detected.
116 : *
117 : * Expected usage is:
118 : *
119 : * tsearch_readline_state trst;
120 : *
121 EUB : * if (!tsearch_readline_begin(&trst, filename))
122 : * ereport(ERROR,
123 : * (errcode(ERRCODE_CONFIG_FILE_ERROR),
124 : * errmsg("could not open stop-word file \"%s\": %m",
125 : * filename)));
126 : * while ((line = tsearch_readline(&trst)) != NULL)
127 : * process line;
128 : * tsearch_readline_end(&trst);
129 : *
130 : * Note that the caller supplies the ereport() for file open failure;
131 : * this is so that a custom message can be provided. The filename string
132 : * passed to tsearch_readline_begin() must remain valid through
133 : * tsearch_readline_end().
134 : */
135 : bool
136 GIC 278 : tsearch_readline_begin(tsearch_readline_state *stp,
137 : const char *filename)
138 : {
139 278 : if ((stp->fp = AllocateFile(filename, "r")) == NULL)
140 UIC 0 : return false;
141 GIC 278 : stp->filename = filename;
142 278 : stp->lineno = 0;
143 278 : initStringInfo(&stp->buf);
144 278 : stp->curline = NULL;
145 : /* Setup error traceback support for ereport() */
146 278 : stp->cb.callback = tsearch_readline_callback;
147 278 : stp->cb.arg = (void *) stp;
148 278 : stp->cb.previous = error_context_stack;
149 278 : error_context_stack = &stp->cb;
150 278 : return true;
151 ECB : }
152 :
153 : /*
154 : * Read the next line from a tsearch data file (expected to be in UTF-8), and
155 EUB : * convert it to database encoding if needed. The returned string is palloc'd.
156 ECB : * NULL return means EOF.
157 : */
158 : char *
159 CBC 10711 : tsearch_readline(tsearch_readline_state *stp)
160 : {
161 ECB : char *recoded;
162 :
163 : /* Advance line number to use in error reports */
164 CBC 10711 : stp->lineno++;
165 ECB :
166 : /* Clear curline, it's no longer relevant */
167 GIC 10711 : if (stp->curline)
168 : {
169 10433 : if (stp->curline != stp->buf.data)
170 UIC 0 : pfree(stp->curline);
171 GIC 10433 : stp->curline = NULL;
172 : }
173 :
174 ECB : /* Collect next line, if there is one */
175 GIC 10711 : if (!pg_get_line_buf(stp->fp, &stp->buf))
176 235 : return NULL;
177 :
178 : /* Validate the input as UTF-8, then convert to DB encoding if needed */
179 CBC 10476 : recoded = pg_any_to_server(stp->buf.data, stp->buf.len, PG_UTF8);
180 :
181 : /* Save the correctly-encoded string for possible error reports */
182 10476 : stp->curline = recoded; /* might be equal to buf.data */
183 :
184 ECB : /*
185 EUB : * We always return a freshly pstrdup'd string. This is clearly necessary
186 ECB : * if pg_any_to_server() returned buf.data, and we need a second copy even
187 : * if encoding conversion did occur. The caller is entitled to pfree the
188 : * returned string at any time, which would leave curline pointing to
189 : * recycled storage, causing problems if an error occurs after that point.
190 : * (It's preferable to return the result of pstrdup instead of the output
191 : * of pg_any_to_server, because the conversion result tends to be
192 : * over-allocated. Since callers might save the result string directly
193 : * into a long-lived dictionary structure, we don't want it to be a larger
194 : * palloc chunk than necessary. We'll reclaim the conversion result on
195 : * the next call.)
196 : */
197 CBC 10476 : return pstrdup(recoded);
198 : }
199 :
200 : /*
201 : * Close down after reading a file with tsearch_readline()
202 : */
203 : void
204 GIC 278 : tsearch_readline_end(tsearch_readline_state *stp)
205 : {
206 : /* Suppress use of curline in any error reported below */
207 278 : if (stp->curline)
208 : {
209 43 : if (stp->curline != stp->buf.data)
210 UIC 0 : pfree(stp->curline);
211 GIC 43 : stp->curline = NULL;
212 ECB : }
213 :
214 : /* Release other resources */
215 GIC 278 : pfree(stp->buf.data);
216 278 : FreeFile(stp->fp);
217 :
218 : /* Pop the error context stack */
219 CBC 278 : error_context_stack = stp->cb.previous;
220 GIC 278 : }
221 :
222 ECB : /*
223 : * Error context callback for errors occurring while reading a tsearch
224 : * configuration file.
225 EUB : */
226 ECB : static void
227 UIC 0 : tsearch_readline_callback(void *arg)
228 : {
229 0 : tsearch_readline_state *stp = (tsearch_readline_state *) arg;
230 ECB :
231 : /*
232 : * We can't include the text of the config line for errors that occur
233 : * during tsearch_readline() itself. The major cause of such errors is
234 : * encoding violations, and we daren't try to print error messages
235 : * containing badly-encoded data.
236 : */
237 UIC 0 : if (stp->curline)
238 0 : errcontext("line %d of configuration file \"%s\": \"%s\"",
239 : stp->lineno,
240 : stp->filename,
241 : stp->curline);
242 EUB : else
243 UIC 0 : errcontext("line %d of configuration file \"%s\"",
244 EUB : stp->lineno,
245 : stp->filename);
246 UIC 0 : }
247 :
248 :
/*
 * lowerstr --- lower-case a null-terminated string
 *
 * Convenience wrapper that measures the string with strlen() and passes it
 * to lowerstr_with_len().  The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
259 :
260 : /*
261 : * lowerstr_with_len --- fold string to lower case
262 : *
263 : * Input string need not be null-terminated.
264 : *
265 : * Returned string is palloc'd
266 : */
267 : char *
268 GIC 140846 : lowerstr_with_len(const char *str, int len)
269 : {
270 ECB : char *out;
271 GIC 140846 : pg_locale_t mylocale = 0; /* TODO */
272 ECB :
273 GIC 140846 : if (len == 0)
274 UIC 0 : return pstrdup("");
275 :
276 : /*
277 : * Use wide char code only when max encoding length > 1 and ctype != C.
278 : * Some operating systems fail with multi-byte encodings and a C locale.
279 : * Also, for a C locale there is no need to process as multibyte. From
280 : * backend/utils/adt/oracle_compat.c Teodor
281 : */
282 GIC 140846 : if (pg_database_encoding_max_length() > 1 && !database_ctype_is_c)
283 CBC 135993 : {
284 : wchar_t *wstr,
285 : *wptr;
286 ECB : int wlen;
287 :
288 : /*
289 EUB : * alloc number of wchar_t for worst case, len contains number of
290 : * bytes >= number of characters and alloc 1 wchar_t for 0, because
291 : * wchar2char wants zero-terminated string
292 : */
293 GIC 135993 : wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
294 :
295 135993 : wlen = char2wchar(wstr, len + 1, str, len, mylocale);
296 135993 : Assert(wlen <= len);
297 ECB :
298 CBC 1140745 : while (*wptr)
299 : {
300 GIC 1004752 : *wptr = towlower((wint_t) *wptr);
301 1004752 : wptr++;
302 : }
303 :
304 : /*
305 : * Alloc result string for worst case + '\0'
306 : */
307 135993 : len = pg_database_encoding_max_length() * wlen + 1;
308 CBC 135993 : out = (char *) palloc(len);
309 :
310 135993 : wlen = wchar2char(out, wstr, len, mylocale);
311 ECB :
312 GIC 135993 : pfree(wstr);
313 ECB :
314 GIC 135993 : if (wlen < 0)
315 LBC 0 : ereport(ERROR,
316 ECB : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
317 : errmsg("conversion from wchar_t to server encoding failed: %m")));
318 GIC 135993 : Assert(wlen < len);
319 : }
320 : else
321 : {
322 CBC 4853 : const char *ptr = str;
323 ECB : char *outptr;
324 :
325 CBC 4853 : outptr = out = (char *) palloc(sizeof(char) * (len + 1));
326 GIC 28192 : while ((ptr - str) < len && *ptr)
327 ECB : {
328 GIC 23339 : *outptr++ = tolower(TOUCHAR(ptr));
329 CBC 23339 : ptr++;
330 EUB : }
331 GIC 4853 : *outptr = '\0';
332 : }
333 ECB :
334 GIC 140846 : return out;
335 : }
|