Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * regc_pg_locale.c
4 : : * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 : : * and functions to cache the results of wholesale ctype probing.
6 : : *
7 : : * This file is #included by regcomp.c; it's not meant to compile standalone.
8 : : *
9 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
10 : : * Portions Copyright (c) 1994, Regents of the University of California
11 : : *
12 : : * IDENTIFICATION
13 : : * src/backend/regex/regc_pg_locale.c
14 : : *
15 : : *-------------------------------------------------------------------------
16 : : */
17 : :
18 : : #include "catalog/pg_collation.h"
19 : : #include "common/unicode_case.h"
20 : : #include "common/unicode_category.h"
21 : : #include "utils/pg_locale.h"
22 : :
23 : : /*
24 : : * To provide as much functionality as possible on a variety of platforms,
25 : : * without going so far as to implement everything from scratch, we use
26 : : * several implementation strategies depending on the situation:
27 : : *
28 : : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
29 : : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
30 : : * collations don't give a fig about multibyte characters.
31 : : *
32 : : * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
33 : : *
34 : : * 2a. When working in UTF8 encoding, we use the <wctype.h> functions.
35 : : * This assumes that every platform uses Unicode codepoints directly
36 : : * as the wchar_t representation of Unicode. On some platforms
37 : : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
38 : : *
39 : : * 2b. In all other encodings, we use the <ctype.h> functions for pg_wchar
40 : : * values up to 255, and punt for values above that. This is 100% correct
41 : : * only in single-byte encodings such as LATINn. However, non-Unicode
42 : : * multibyte encodings are mostly Far Eastern character sets for which the
43 : : * properties being tested here aren't very relevant for higher code values
44 : : * anyway. The difficulty with using the <wctype.h> functions with
45 : : * non-Unicode multibyte encodings is that we can have no certainty that
46 : : * the platform's wchar_t representation matches what we do in pg_wchar
47 : : * conversions.
48 : : *
49 : : * 3. In other (non-default) libc collations, we use the locale_t-extended
50 : : * forms of the <wctype.h> and <ctype.h> functions, under the same cases as #2.
51 : : *
52 : : * There is one notable difference between cases 2 and 3: in the "default"
53 : : * collation we force ASCII letters to follow ASCII upcase/downcase rules,
54 : : * while in a non-default collation we just let the library functions do what
55 : : * they will. The case where this matters is treatment of I/i in Turkish,
56 : : * and the behavior is meant to match the upper()/lower() SQL functions.
57 : : *
58 : : * We store the active collation setting in static variables. In principle
59 : : * it could be passed down to here via the regex library's "struct vars" data
60 : : * structure; but that would require somewhat invasive changes in the regex
61 : : * library, and right now there's no real benefit to be gained from that.
62 : : *
63 : : * NB: the coding here assumes pg_wchar is an unsigned type.
64 : : */
65 : :
66 : : typedef enum
67 : : {
68 : : PG_REGEX_LOCALE_C, /* C locale (encoding independent) */
69 : : PG_REGEX_BUILTIN, /* built-in Unicode semantics */
70 : : PG_REGEX_LOCALE_WIDE, /* Use <wctype.h> functions */
71 : : PG_REGEX_LOCALE_1BYTE, /* Use <ctype.h> functions */
72 : : PG_REGEX_LOCALE_WIDE_L, /* Use locale_t <wctype.h> functions */
73 : : PG_REGEX_LOCALE_1BYTE_L, /* Use locale_t <ctype.h> functions */
74 : : PG_REGEX_LOCALE_ICU, /* Use ICU uchar.h functions */
75 : : } PG_Locale_Strategy;
76 : :
77 : : static PG_Locale_Strategy pg_regex_strategy;
78 : : static pg_locale_t pg_regex_locale;
79 : : static Oid pg_regex_collation;
80 : :
/*
 * Hard-wired character properties for C locale
 *
 * Each PG_IS* flag is one bit; pg_char_properties[] holds the set of flags
 * that apply to each of the 128 ASCII codes.
 */
#define PG_ISDIGIT	0x01
#define PG_ISALPHA	0x02
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
#define PG_ISUPPER	0x04
#define PG_ISLOWER	0x08
#define PG_ISGRAPH	0x10
#define PG_ISPRINT	0x20
#define PG_ISPUNCT	0x40
#define PG_ISSPACE	0x80

/* Shorthand for the flag combinations shared by whole groups of entries */
#define PG_CP_PUNCT (PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT)
#define PG_CP_DIGIT (PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_UPPER (PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_LOWER (PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT)

static const unsigned char pg_char_properties[128] = {
	/* 0x00-0x07: NUL, ^A through ^G */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x08-0x0F: ^H, then ^I through ^M (whitespace), then ^N, ^O */
	0, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, 0, 0,
	/* 0x10-0x17: ^P through ^W */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x18-0x1F: ^X through ^_ */
	0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20-0x27: space, then seven punctuation characters */
	PG_ISPRINT | PG_ISSPACE, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x28-0x2F: left parenthesis through slash */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x30-0x37: digits 0 through 7 */
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	/* 0x38-0x3F: digits 8 and 9, then colon through question mark */
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x40-0x47: at-sign, then A through G */
	PG_CP_PUNCT, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	/* 0x48-0x4F: H through O */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	/* 0x50-0x57: P through W */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	/* 0x58-0x5F: X, Y, Z, then left bracket through underscore */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	/* 0x60-0x67: backquote, then a through g */
	PG_CP_PUNCT, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	/* 0x68-0x6F: h through o */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	/* 0x70-0x77: p through w */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	/* 0x78-0x7F: x, y, z, left brace through tilde, DEL */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, 0
};

/* The shorthand macros are only needed to build the table */
#undef PG_CP_PUNCT
#undef PG_CP_DIGIT
#undef PG_CP_UPPER
#undef PG_CP_LOWER
224 : :
225 : :
226 : : /*
227 : : * pg_set_regex_collation: set collation for these functions to obey
228 : : *
229 : : * This is called when beginning compilation or execution of a regexp.
230 : : * Since there's no need for reentrancy of regexp operations, it's okay
231 : : * to store the results in static variables.
232 : : */
233 : : void
4753 tgl@sss.pgh.pa.us 234 :CBC 961448 : pg_set_regex_collation(Oid collation)
235 : : {
815 peter@eisentraut.org 236 [ - + ]: 961448 : if (!OidIsValid(collation))
237 : : {
238 : : /*
239 : : * This typically means that the parser could not resolve a conflict
240 : : * of implicit collations, so report it that way.
241 : : */
815 peter@eisentraut.org 242 [ # # ]:UBC 0 : ereport(ERROR,
243 : : (errcode(ERRCODE_INDETERMINATE_COLLATION),
244 : : errmsg("could not determine which collation to use for regular expression"),
245 : : errhint("Use the COLLATE clause to set the collation explicitly.")));
246 : : }
247 : :
4753 tgl@sss.pgh.pa.us 248 [ + + ]:CBC 961448 : if (lc_ctype_is_c(collation))
249 : : {
250 : : /* C/POSIX collations use this path regardless of database encoding */
251 : 70746 : pg_regex_strategy = PG_REGEX_LOCALE_C;
252 : 70746 : pg_regex_locale = 0;
4438 253 : 70746 : pg_regex_collation = C_COLLATION_OID;
254 : : }
255 : : else
256 : : {
815 peter@eisentraut.org 257 : 890702 : pg_regex_locale = pg_newlocale_from_collation(collation);
258 : :
416 jdavis@postgresql.or 259 [ + + ]: 890702 : if (!pg_locale_deterministic(pg_regex_locale))
1850 peter@eisentraut.org 260 [ + - ]: 12 : ereport(ERROR,
261 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
262 : : errmsg("nondeterministic collations are not supported for regular expressions")));
263 : :
264 : : #ifdef USE_ICU
2579 peter_e@gmx.net 265 [ + + + + ]: 890690 : if (pg_regex_locale && pg_regex_locale->provider == COLLPROVIDER_ICU)
266 : 471 : pg_regex_strategy = PG_REGEX_LOCALE_ICU;
267 : : else
268 : : #endif
4753 tgl@sss.pgh.pa.us 269 [ + + ]: 890219 : if (GetDatabaseEncoding() == PG_UTF8)
270 : : {
271 [ + + ]: 890217 : if (pg_regex_locale)
272 : : {
26 jdavis@postgresql.or 273 [ + - ]:GNC 95024 : if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
274 : 95024 : pg_regex_strategy = PG_REGEX_BUILTIN;
275 : : else
26 jdavis@postgresql.or 276 :UNC 0 : pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
277 : : }
278 : : else
4753 tgl@sss.pgh.pa.us 279 :CBC 795193 : pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
280 : : }
281 : : else
282 : : {
283 [ - + ]: 2 : if (pg_regex_locale)
4753 tgl@sss.pgh.pa.us 284 :UBC 0 : pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L;
285 : : else
4753 tgl@sss.pgh.pa.us 286 :CBC 2 : pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
287 : : }
288 : :
4438 289 : 890690 : pg_regex_collation = collation;
290 : : }
4753 291 : 961436 : }
292 : :
293 : : static int
294 : 72968 : pg_wc_isdigit(pg_wchar c)
295 : : {
296 [ + + + - : 72968 : switch (pg_regex_strategy)
- - + - ]
297 : : {
298 : 1071 : case PG_REGEX_LOCALE_C:
299 [ + - ]: 2142 : return (c <= (pg_wchar) 127 &&
300 [ + + ]: 1071 : (pg_char_properties[c] & PG_ISDIGIT));
26 jdavis@postgresql.or 301 :GNC 22583 : case PG_REGEX_BUILTIN:
302 : 22583 : return pg_u_isdigit(c, true);
4753 tgl@sss.pgh.pa.us 303 :CBC 43170 : case PG_REGEX_LOCALE_WIDE:
304 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
305 : 43170 : return iswdigit((wint_t) c);
306 : : /* FALL THRU */
307 : : case PG_REGEX_LOCALE_1BYTE:
4753 tgl@sss.pgh.pa.us 308 [ # # ]:UBC 0 : return (c <= (pg_wchar) UCHAR_MAX &&
309 [ # # ]: 0 : isdigit((unsigned char) c));
310 : 0 : case PG_REGEX_LOCALE_WIDE_L:
311 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 312 : 0 : return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
313 : : /* FALL THRU */
314 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 315 [ # # ]: 0 : return (c <= (pg_wchar) UCHAR_MAX &&
2579 peter_e@gmx.net 316 [ # # ]: 0 : isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
317 : : break;
2579 peter_e@gmx.net 318 :CBC 6144 : case PG_REGEX_LOCALE_ICU:
319 : : #ifdef USE_ICU
320 : 6144 : return u_isdigit(c);
321 : : #endif
322 : : break;
323 : : }
4753 tgl@sss.pgh.pa.us 324 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
325 : : }
326 : :
327 : : static int
4753 tgl@sss.pgh.pa.us 328 :CBC 8330 : pg_wc_isalpha(pg_wchar c)
329 : : {
330 [ - + + - : 8330 : switch (pg_regex_strategy)
- - + - ]
331 : : {
4753 tgl@sss.pgh.pa.us 332 :UBC 0 : case PG_REGEX_LOCALE_C:
333 [ # # ]: 0 : return (c <= (pg_wchar) 127 &&
334 [ # # ]: 0 : (pg_char_properties[c] & PG_ISALPHA));
26 jdavis@postgresql.or 335 :GNC 11 : case PG_REGEX_BUILTIN:
336 : 11 : return pg_u_isalpha(c);
4753 tgl@sss.pgh.pa.us 337 :CBC 2175 : case PG_REGEX_LOCALE_WIDE:
338 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
339 : 2175 : return iswalpha((wint_t) c);
340 : : /* FALL THRU */
341 : : case PG_REGEX_LOCALE_1BYTE:
4753 tgl@sss.pgh.pa.us 342 [ # # ]:UBC 0 : return (c <= (pg_wchar) UCHAR_MAX &&
343 [ # # ]: 0 : isalpha((unsigned char) c));
344 : 0 : case PG_REGEX_LOCALE_WIDE_L:
345 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 346 : 0 : return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
347 : : /* FALL THRU */
348 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 349 [ # # ]: 0 : return (c <= (pg_wchar) UCHAR_MAX &&
2579 peter_e@gmx.net 350 [ # # ]: 0 : isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
351 : : break;
2579 peter_e@gmx.net 352 :CBC 6144 : case PG_REGEX_LOCALE_ICU:
353 : : #ifdef USE_ICU
354 : 6144 : return u_isalpha(c);
355 : : #endif
356 : : break;
357 : : }
4753 tgl@sss.pgh.pa.us 358 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
359 : : }
360 : :
361 : : static int
4753 tgl@sss.pgh.pa.us 362 :CBC 33158 : pg_wc_isalnum(pg_wchar c)
363 : : {
364 [ + + + - : 33158 : switch (pg_regex_strategy)
- - + - ]
365 : : {
366 : 381 : case PG_REGEX_LOCALE_C:
367 [ + - ]: 762 : return (c <= (pg_wchar) 127 &&
368 [ + + ]: 381 : (pg_char_properties[c] & PG_ISALNUM));
26 jdavis@postgresql.or 369 :GNC 10238 : case PG_REGEX_BUILTIN:
370 : 10238 : return pg_u_isalnum(c, true);
4753 tgl@sss.pgh.pa.us 371 :CBC 16395 : case PG_REGEX_LOCALE_WIDE:
372 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
373 : 16395 : return iswalnum((wint_t) c);
374 : : /* FALL THRU */
375 : : case PG_REGEX_LOCALE_1BYTE:
4753 tgl@sss.pgh.pa.us 376 [ # # ]:UBC 0 : return (c <= (pg_wchar) UCHAR_MAX &&
377 [ # # ]: 0 : isalnum((unsigned char) c));
378 : 0 : case PG_REGEX_LOCALE_WIDE_L:
379 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 380 : 0 : return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
381 : : /* FALL THRU */
382 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 383 [ # # ]: 0 : return (c <= (pg_wchar) UCHAR_MAX &&
2579 peter_e@gmx.net 384 [ # # ]: 0 : isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
385 : : break;
2579 peter_e@gmx.net 386 :CBC 6144 : case PG_REGEX_LOCALE_ICU:
387 : : #ifdef USE_ICU
388 : 6144 : return u_isalnum(c);
389 : : #endif
390 : : break;
391 : : }
4753 tgl@sss.pgh.pa.us 392 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
393 : : }
394 : :
395 : : static int
1144 tgl@sss.pgh.pa.us 396 :CBC 16769 : pg_wc_isword(pg_wchar c)
397 : : {
398 : : /* We define word characters as alnum class plus underscore */
399 [ + + ]: 16769 : if (c == CHR('_'))
400 : 11 : return 1;
401 : 16758 : return pg_wc_isalnum(c);
402 : : }
403 : :
404 : : static int
4753 405 : 14344 : pg_wc_isupper(pg_wchar c)
406 : : {
407 [ - + + - : 14344 : switch (pg_regex_strategy)
- - + - ]
408 : : {
4753 tgl@sss.pgh.pa.us 409 :UBC 0 : case PG_REGEX_LOCALE_C:
410 [ # # ]: 0 : return (c <= (pg_wchar) 127 &&
411 [ # # ]: 0 : (pg_char_properties[c] & PG_ISUPPER));
26 jdavis@postgresql.or 412 :GNC 6144 : case PG_REGEX_BUILTIN:
413 : 6144 : return pg_u_isupper(c);
4753 tgl@sss.pgh.pa.us 414 :CBC 2056 : case PG_REGEX_LOCALE_WIDE:
415 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
416 : 2056 : return iswupper((wint_t) c);
417 : : /* FALL THRU */
418 : : case PG_REGEX_LOCALE_1BYTE:
4753 tgl@sss.pgh.pa.us 419 [ # # ]:UBC 0 : return (c <= (pg_wchar) UCHAR_MAX &&
420 [ # # ]: 0 : isupper((unsigned char) c));
421 : 0 : case PG_REGEX_LOCALE_WIDE_L:
422 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 423 : 0 : return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
424 : : /* FALL THRU */
425 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 426 [ # # ]: 0 : return (c <= (pg_wchar) UCHAR_MAX &&
2579 peter_e@gmx.net 427 [ # # ]: 0 : isupper_l((unsigned char) c, pg_regex_locale->info.lt));
428 : : break;
2579 peter_e@gmx.net 429 :CBC 6144 : case PG_REGEX_LOCALE_ICU:
430 : : #ifdef USE_ICU
431 : 6144 : return u_isupper(c);
432 : : #endif
433 : : break;
434 : : }
4753 tgl@sss.pgh.pa.us 435 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
436 : : }
437 : :
438 : : static int
4753 tgl@sss.pgh.pa.us 439 :CBC 8195 : pg_wc_islower(pg_wchar c)
440 : : {
441 [ - - + - : 8195 : switch (pg_regex_strategy)
- - + - ]
442 : : {
4753 tgl@sss.pgh.pa.us 443 :UBC 0 : case PG_REGEX_LOCALE_C:
444 [ # # ]: 0 : return (c <= (pg_wchar) 127 &&
445 [ # # ]: 0 : (pg_char_properties[c] & PG_ISLOWER));
26 jdavis@postgresql.or 446 :UNC 0 : case PG_REGEX_BUILTIN:
447 : 0 : return pg_u_islower(c);
4753 tgl@sss.pgh.pa.us 448 :CBC 2051 : case PG_REGEX_LOCALE_WIDE:
449 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
450 : 2051 : return iswlower((wint_t) c);
451 : : /* FALL THRU */
452 : : case PG_REGEX_LOCALE_1BYTE:
4753 tgl@sss.pgh.pa.us 453 [ # # ]:UBC 0 : return (c <= (pg_wchar) UCHAR_MAX &&
454 [ # # ]: 0 : islower((unsigned char) c));
455 : 0 : case PG_REGEX_LOCALE_WIDE_L:
456 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 457 : 0 : return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
458 : : /* FALL THRU */
459 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 460 [ # # ]: 0 : return (c <= (pg_wchar) UCHAR_MAX &&
2579 peter_e@gmx.net 461 [ # # ]: 0 : islower_l((unsigned char) c, pg_regex_locale->info.lt));
462 : : break;
2579 peter_e@gmx.net 463 :CBC 6144 : case PG_REGEX_LOCALE_ICU:
464 : : #ifdef USE_ICU
465 : 6144 : return u_islower(c);
466 : : #endif
467 : : break;
468 : : }
4753 tgl@sss.pgh.pa.us 469 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
470 : : }
471 : :
472 : : static int
4753 tgl@sss.pgh.pa.us 473 :CBC 8195 : pg_wc_isgraph(pg_wchar c)
474 : : {
475 [ - - + - : 8195 : switch (pg_regex_strategy)
- - + - ]
476 : : {
4753 tgl@sss.pgh.pa.us 477 :UBC 0 : case PG_REGEX_LOCALE_C:
478 [ # # ]: 0 : return (c <= (pg_wchar) 127 &&
479 [ # # ]: 0 : (pg_char_properties[c] & PG_ISGRAPH));
26 jdavis@postgresql.or 480 :UNC 0 : case PG_REGEX_BUILTIN:
481 : 0 : return pg_u_isgraph(c);
4753 tgl@sss.pgh.pa.us 482 :CBC 2051 : case PG_REGEX_LOCALE_WIDE:
483 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
484 : 2051 : return iswgraph((wint_t) c);
485 : : /* FALL THRU */
486 : : case PG_REGEX_LOCALE_1BYTE:
4753 tgl@sss.pgh.pa.us 487 [ # # ]:UBC 0 : return (c <= (pg_wchar) UCHAR_MAX &&
488 [ # # ]: 0 : isgraph((unsigned char) c));
489 : 0 : case PG_REGEX_LOCALE_WIDE_L:
490 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 491 : 0 : return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
492 : : /* FALL THRU */
493 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 494 [ # # ]: 0 : return (c <= (pg_wchar) UCHAR_MAX &&
2579 peter_e@gmx.net 495 [ # # ]: 0 : isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
496 : : break;
2579 peter_e@gmx.net 497 :CBC 6144 : case PG_REGEX_LOCALE_ICU:
498 : : #ifdef USE_ICU
499 : 6144 : return u_isgraph(c);
500 : : #endif
501 : : break;
502 : : }
4753 tgl@sss.pgh.pa.us 503 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
504 : : }
505 : :
506 : : static int
4753 tgl@sss.pgh.pa.us 507 :CBC 8195 : pg_wc_isprint(pg_wchar c)
508 : : {
509 [ - - + - : 8195 : switch (pg_regex_strategy)
- - + - ]
510 : : {
4753 tgl@sss.pgh.pa.us 511 :UBC 0 : case PG_REGEX_LOCALE_C:
512 [ # # ]: 0 : return (c <= (pg_wchar) 127 &&
513 [ # # ]: 0 : (pg_char_properties[c] & PG_ISPRINT));
26 jdavis@postgresql.or 514 :UNC 0 : case PG_REGEX_BUILTIN:
515 : 0 : return pg_u_isprint(c);
4753 tgl@sss.pgh.pa.us 516 :CBC 2051 : case PG_REGEX_LOCALE_WIDE:
517 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
518 : 2051 : return iswprint((wint_t) c);
519 : : /* FALL THRU */
520 : : case PG_REGEX_LOCALE_1BYTE:
4753 tgl@sss.pgh.pa.us 521 [ # # ]:UBC 0 : return (c <= (pg_wchar) UCHAR_MAX &&
522 [ # # ]: 0 : isprint((unsigned char) c));
523 : 0 : case PG_REGEX_LOCALE_WIDE_L:
524 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 525 : 0 : return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
526 : : /* FALL THRU */
527 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 528 [ # # ]: 0 : return (c <= (pg_wchar) UCHAR_MAX &&
2579 peter_e@gmx.net 529 [ # # ]: 0 : isprint_l((unsigned char) c, pg_regex_locale->info.lt));
530 : : break;
2579 peter_e@gmx.net 531 :CBC 6144 : case PG_REGEX_LOCALE_ICU:
532 : : #ifdef USE_ICU
533 : 6144 : return u_isprint(c);
534 : : #endif
535 : : break;
536 : : }
4753 tgl@sss.pgh.pa.us 537 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
538 : : }
539 : :
540 : : static int
4753 tgl@sss.pgh.pa.us 541 :CBC 14339 : pg_wc_ispunct(pg_wchar c)
542 : : {
543 [ - + + - : 14339 : switch (pg_regex_strategy)
- - + - ]
544 : : {
4753 tgl@sss.pgh.pa.us 545 :UBC 0 : case PG_REGEX_LOCALE_C:
546 [ # # ]: 0 : return (c <= (pg_wchar) 127 &&
547 [ # # ]: 0 : (pg_char_properties[c] & PG_ISPUNCT));
26 jdavis@postgresql.or 548 :GNC 6144 : case PG_REGEX_BUILTIN:
549 : 6144 : return pg_u_ispunct(c, true);
4753 tgl@sss.pgh.pa.us 550 :CBC 2051 : case PG_REGEX_LOCALE_WIDE:
551 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
552 : 2051 : return iswpunct((wint_t) c);
553 : : /* FALL THRU */
554 : : case PG_REGEX_LOCALE_1BYTE:
4753 tgl@sss.pgh.pa.us 555 [ # # ]:UBC 0 : return (c <= (pg_wchar) UCHAR_MAX &&
556 [ # # ]: 0 : ispunct((unsigned char) c));
557 : 0 : case PG_REGEX_LOCALE_WIDE_L:
558 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 559 : 0 : return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
560 : : /* FALL THRU */
561 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 562 [ # # ]: 0 : return (c <= (pg_wchar) UCHAR_MAX &&
2579 peter_e@gmx.net 563 [ # # ]: 0 : ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
564 : : break;
2579 peter_e@gmx.net 565 :CBC 6144 : case PG_REGEX_LOCALE_ICU:
566 : : #ifdef USE_ICU
567 : 6144 : return u_ispunct(c);
568 : : #endif
569 : : break;
570 : : }
4753 tgl@sss.pgh.pa.us 571 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
572 : : }
573 : :
574 : : static int
4753 tgl@sss.pgh.pa.us 575 :CBC 38193 : pg_wc_isspace(pg_wchar c)
576 : : {
577 [ - + + - : 38193 : switch (pg_regex_strategy)
- - + - ]
578 : : {
4753 tgl@sss.pgh.pa.us 579 :UBC 0 : case PG_REGEX_LOCALE_C:
580 [ # # ]: 0 : return (c <= (pg_wchar) 127 &&
581 [ # # ]: 0 : (pg_char_properties[c] & PG_ISSPACE));
26 jdavis@postgresql.or 582 :GNC 8199 : case PG_REGEX_BUILTIN:
583 : 8199 : return pg_u_isspace(c);
4753 tgl@sss.pgh.pa.us 584 :CBC 23850 : case PG_REGEX_LOCALE_WIDE:
585 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
586 : 23850 : return iswspace((wint_t) c);
587 : : /* FALL THRU */
588 : : case PG_REGEX_LOCALE_1BYTE:
4753 tgl@sss.pgh.pa.us 589 [ # # ]:UBC 0 : return (c <= (pg_wchar) UCHAR_MAX &&
590 [ # # ]: 0 : isspace((unsigned char) c));
591 : 0 : case PG_REGEX_LOCALE_WIDE_L:
592 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 593 : 0 : return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
594 : : /* FALL THRU */
595 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 596 [ # # ]: 0 : return (c <= (pg_wchar) UCHAR_MAX &&
2579 peter_e@gmx.net 597 [ # # ]: 0 : isspace_l((unsigned char) c, pg_regex_locale->info.lt));
598 : : break;
2579 peter_e@gmx.net 599 :CBC 6144 : case PG_REGEX_LOCALE_ICU:
600 : : #ifdef USE_ICU
601 : 6144 : return u_isspace(c);
602 : : #endif
603 : : break;
604 : : }
4753 tgl@sss.pgh.pa.us 605 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
606 : : }
607 : :
608 : : static pg_wchar
4753 tgl@sss.pgh.pa.us 609 :CBC 5273 : pg_wc_toupper(pg_wchar c)
610 : : {
611 [ + + + - : 5273 : switch (pg_regex_strategy)
- - + - ]
612 : : {
613 : 528 : case PG_REGEX_LOCALE_C:
614 [ + - ]: 528 : if (c <= (pg_wchar) 127)
615 : 528 : return pg_ascii_toupper((unsigned char) c);
4753 tgl@sss.pgh.pa.us 616 :UBC 0 : return c;
26 jdavis@postgresql.or 617 :GNC 186 : case PG_REGEX_BUILTIN:
618 : 186 : return unicode_uppercase_simple(c);
4753 tgl@sss.pgh.pa.us 619 :CBC 4505 : case PG_REGEX_LOCALE_WIDE:
620 : : /* force C behavior for ASCII characters, per comments above */
621 [ + + ]: 4505 : if (c <= (pg_wchar) 127)
622 : 407 : return pg_ascii_toupper((unsigned char) c);
623 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
624 : 4098 : return towupper((wint_t) c);
625 : : /* FALL THRU */
626 : : case PG_REGEX_LOCALE_1BYTE:
627 : : /* force C behavior for ASCII characters, per comments above */
4753 tgl@sss.pgh.pa.us 628 [ # # ]:UBC 0 : if (c <= (pg_wchar) 127)
629 : 0 : return pg_ascii_toupper((unsigned char) c);
630 [ # # ]: 0 : if (c <= (pg_wchar) UCHAR_MAX)
631 : 0 : return toupper((unsigned char) c);
632 : 0 : return c;
633 : 0 : case PG_REGEX_LOCALE_WIDE_L:
634 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 635 : 0 : return towupper_l((wint_t) c, pg_regex_locale->info.lt);
636 : : /* FALL THRU */
637 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 638 [ # # ]: 0 : if (c <= (pg_wchar) UCHAR_MAX)
2579 peter_e@gmx.net 639 : 0 : return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
4753 tgl@sss.pgh.pa.us 640 : 0 : return c;
2579 peter_e@gmx.net 641 :CBC 54 : case PG_REGEX_LOCALE_ICU:
642 : : #ifdef USE_ICU
643 : 54 : return u_toupper(c);
644 : : #endif
645 : : break;
646 : : }
4753 tgl@sss.pgh.pa.us 647 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
648 : : }
649 : :
650 : : static pg_wchar
4753 tgl@sss.pgh.pa.us 651 :CBC 5275 : pg_wc_tolower(pg_wchar c)
652 : : {
653 [ + + + - : 5275 : switch (pg_regex_strategy)
- - + - ]
654 : : {
655 : 528 : case PG_REGEX_LOCALE_C:
656 [ + - ]: 528 : if (c <= (pg_wchar) 127)
657 : 528 : return pg_ascii_tolower((unsigned char) c);
4753 tgl@sss.pgh.pa.us 658 :UBC 0 : return c;
26 jdavis@postgresql.or 659 :GNC 186 : case PG_REGEX_BUILTIN:
660 : 186 : return unicode_lowercase_simple(c);
4753 tgl@sss.pgh.pa.us 661 :CBC 4507 : case PG_REGEX_LOCALE_WIDE:
662 : : /* force C behavior for ASCII characters, per comments above */
663 [ + + ]: 4507 : if (c <= (pg_wchar) 127)
664 : 409 : return pg_ascii_tolower((unsigned char) c);
665 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
666 : 4098 : return towlower((wint_t) c);
667 : : /* FALL THRU */
668 : : case PG_REGEX_LOCALE_1BYTE:
669 : : /* force C behavior for ASCII characters, per comments above */
4753 tgl@sss.pgh.pa.us 670 [ # # ]:UBC 0 : if (c <= (pg_wchar) 127)
671 : 0 : return pg_ascii_tolower((unsigned char) c);
672 [ # # ]: 0 : if (c <= (pg_wchar) UCHAR_MAX)
673 : 0 : return tolower((unsigned char) c);
674 : 0 : return c;
675 : 0 : case PG_REGEX_LOCALE_WIDE_L:
676 : : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
2579 peter_e@gmx.net 677 : 0 : return towlower_l((wint_t) c, pg_regex_locale->info.lt);
678 : : /* FALL THRU */
679 : : case PG_REGEX_LOCALE_1BYTE_L:
4753 tgl@sss.pgh.pa.us 680 [ # # ]: 0 : if (c <= (pg_wchar) UCHAR_MAX)
2579 peter_e@gmx.net 681 : 0 : return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
4753 tgl@sss.pgh.pa.us 682 : 0 : return c;
2579 peter_e@gmx.net 683 :CBC 54 : case PG_REGEX_LOCALE_ICU:
684 : : #ifdef USE_ICU
685 : 54 : return u_tolower(c);
686 : : #endif
687 : : break;
688 : : }
4753 tgl@sss.pgh.pa.us 689 :UBC 0 : return 0; /* can't get here, but keep compiler quiet */
690 : : }
691 : :
692 : :
693 : : /*
694 : : * These functions cache the results of probing libc's ctype behavior for
695 : : * all character codes of interest in a given encoding/collation. The
696 : : * result is provided as a "struct cvec", but notice that the representation
697 : : * is a touch different from a cvec created by regc_cvec.c: we allocate the
698 : : * chrs[] and ranges[] arrays separately from the struct so that we can
699 : : * realloc them larger at need. This is okay since the cvecs made here
700 : : * should never be freed by freecvec().
701 : : *
702 : : * We use malloc not palloc since we mustn't lose control on out-of-memory;
703 : : * the main regex code expects us to return a failure indication instead.
704 : : */
705 : :
706 : : typedef int (*pg_wc_probefunc) (pg_wchar c);
707 : :
708 : : typedef struct pg_ctype_cache
709 : : {
710 : : pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
711 : : Oid collation; /* collation this entry is for */
712 : : struct cvec cv; /* cache entry contents */
713 : : struct pg_ctype_cache *next; /* chain link */
714 : : } pg_ctype_cache;
715 : :
716 : : static pg_ctype_cache *pg_ctype_cache_list = NULL;
717 : :
718 : : /*
719 : : * Add a chr or range to pcc->cv; return false if run out of memory
720 : : */
721 : : static bool
4438 tgl@sss.pgh.pa.us 722 :CBC 4480 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
723 : : {
724 : : chr *newchrs;
725 : :
726 [ + + ]: 4480 : if (nchrs > 1)
727 : : {
728 [ - + ]: 1361 : if (pcc->cv.nranges >= pcc->cv.rangespace)
729 : : {
4438 tgl@sss.pgh.pa.us 730 :UBC 0 : pcc->cv.rangespace *= 2;
731 : 0 : newchrs = (chr *) realloc(pcc->cv.ranges,
732 : 0 : pcc->cv.rangespace * sizeof(chr) * 2);
733 [ # # ]: 0 : if (newchrs == NULL)
734 : 0 : return false;
735 : 0 : pcc->cv.ranges = newchrs;
736 : : }
4438 tgl@sss.pgh.pa.us 737 :CBC 1361 : pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
738 : 1361 : pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
739 : 1361 : pcc->cv.nranges++;
740 : : }
741 : : else
742 : : {
743 [ - + ]: 3119 : assert(nchrs == 1);
744 [ + + ]: 3119 : if (pcc->cv.nchrs >= pcc->cv.chrspace)
745 : : {
746 : 11 : pcc->cv.chrspace *= 2;
747 : 11 : newchrs = (chr *) realloc(pcc->cv.chrs,
748 : 11 : pcc->cv.chrspace * sizeof(chr));
749 [ - + ]: 11 : if (newchrs == NULL)
4438 tgl@sss.pgh.pa.us 750 :UBC 0 : return false;
4438 tgl@sss.pgh.pa.us 751 :CBC 11 : pcc->cv.chrs = newchrs;
752 : : }
753 : 3119 : pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
754 : : }
755 : 4480 : return true;
756 : : }
757 : :
758 : : /*
759 : : * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
760 : : * chrs satisfying the probe function. The active collation is the one
761 : : * previously set by pg_set_regex_collation. Return NULL if out of memory.
762 : : *
763 : : * Note that the result must not be freed or modified by caller.
764 : : */
765 : : static struct cvec *
2778 766 : 351 : pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
767 : : {
768 : : pg_ctype_cache *pcc;
769 : : pg_wchar max_chr;
770 : : pg_wchar cur_chr;
771 : : int nmatches;
772 : : chr *newchrs;
773 : :
774 : : /*
775 : : * Do we already have the answer cached?
776 : : */
4438 777 [ + + ]: 817 : for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
778 : : {
779 [ + + ]: 707 : if (pcc->probefunc == probefunc &&
780 [ + + ]: 265 : pcc->collation == pg_regex_collation)
781 : 241 : return &pcc->cv;
782 : : }
783 : :
784 : : /*
785 : : * Nope, so initialize some workspace ...
786 : : */
787 : 110 : pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
788 [ - + ]: 110 : if (pcc == NULL)
4438 tgl@sss.pgh.pa.us 789 :UBC 0 : return NULL;
4438 tgl@sss.pgh.pa.us 790 :CBC 110 : pcc->probefunc = probefunc;
791 : 110 : pcc->collation = pg_regex_collation;
792 : 110 : pcc->cv.nchrs = 0;
793 : 110 : pcc->cv.chrspace = 128;
794 : 110 : pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
795 : 110 : pcc->cv.nranges = 0;
796 : 110 : pcc->cv.rangespace = 64;
797 : 110 : pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
798 [ + - - + ]: 110 : if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
4438 tgl@sss.pgh.pa.us 799 :UBC 0 : goto out_of_memory;
2778 tgl@sss.pgh.pa.us 800 :CBC 110 : pcc->cv.cclasscode = cclasscode;
801 : :
802 : : /*
803 : : * Decide how many character codes we ought to look through. In general
804 : : * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
805 : : * runtime using the "high colormap" mechanism. However, in C locale
806 : : * there's no need to go further than 127, and if we only have a 1-byte
807 : : * <ctype.h> API there's no need to go further than that can handle.
808 : : *
809 : : * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
810 : : * output cvec as not having any locale-dependent behavior, since there
811 : : * will be no need to do any run-time locale checks. (The #if's here
812 : : * would always be true for production values of MAX_SIMPLE_CHR, but it's
813 : : * useful to allow it to be small for testing purposes.)
814 : : */
4438 815 [ + + + - : 110 : switch (pg_regex_strategy)
+ - ]
816 : : {
817 : 11 : case PG_REGEX_LOCALE_C:
818 : : #if MAX_SIMPLE_CHR >= 127
819 : 11 : max_chr = (pg_wchar) 127;
2778 820 : 11 : pcc->cv.cclasscode = -1;
821 : : #else
822 : : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
823 : : #endif
4438 824 : 11 : break;
26 jdavis@postgresql.or 825 :GNC 26 : case PG_REGEX_BUILTIN:
826 : 26 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
827 : 26 : break;
4438 tgl@sss.pgh.pa.us 828 :CBC 46 : case PG_REGEX_LOCALE_WIDE:
829 : : case PG_REGEX_LOCALE_WIDE_L:
2778 830 : 46 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
4438 831 : 46 : break;
4438 tgl@sss.pgh.pa.us 832 :UBC 0 : case PG_REGEX_LOCALE_1BYTE:
833 : : case PG_REGEX_LOCALE_1BYTE_L:
834 : : #if MAX_SIMPLE_CHR >= UCHAR_MAX
835 : 0 : max_chr = (pg_wchar) UCHAR_MAX;
2778 836 : 0 : pcc->cv.cclasscode = -1;
837 : : #else
838 : : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
839 : : #endif
4438 840 : 0 : break;
2579 peter_e@gmx.net 841 :CBC 27 : case PG_REGEX_LOCALE_ICU:
842 : 27 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
843 : 27 : break;
4438 tgl@sss.pgh.pa.us 844 :UBC 0 : default:
26 jdavis@postgresql.or 845 :UNC 0 : Assert(false);
4438 tgl@sss.pgh.pa.us 846 :EUB : max_chr = 0; /* can't get here, but keep compiler quiet */
847 : : break;
848 : : }
849 : :
850 : : /*
851 : : * And scan 'em ...
852 : : */
4438 tgl@sss.pgh.pa.us 853 :CBC 110 : nmatches = 0; /* number of consecutive matches */
854 : :
855 [ + + ]: 204270 : for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
856 : : {
857 [ + + ]: 204160 : if ((*probefunc) (cur_chr))
858 : 53351 : nmatches++;
859 [ + + ]: 150809 : else if (nmatches > 0)
860 : : {
861 [ - + ]: 4468 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
4438 tgl@sss.pgh.pa.us 862 :UBC 0 : goto out_of_memory;
4438 tgl@sss.pgh.pa.us 863 :CBC 4468 : nmatches = 0;
864 : : }
865 : : }
866 : :
867 [ + + ]: 110 : if (nmatches > 0)
868 [ - + ]: 12 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
4438 tgl@sss.pgh.pa.us 869 :UBC 0 : goto out_of_memory;
870 : :
871 : : /*
872 : : * We might have allocated more memory than needed, if so free it
873 : : */
4438 tgl@sss.pgh.pa.us 874 [ + + ]:CBC 110 : if (pcc->cv.nchrs == 0)
875 : : {
876 : 43 : free(pcc->cv.chrs);
877 : 43 : pcc->cv.chrs = NULL;
878 : 43 : pcc->cv.chrspace = 0;
879 : : }
880 [ + - ]: 67 : else if (pcc->cv.nchrs < pcc->cv.chrspace)
881 : : {
882 : 67 : newchrs = (chr *) realloc(pcc->cv.chrs,
883 : 67 : pcc->cv.nchrs * sizeof(chr));
884 [ - + ]: 67 : if (newchrs == NULL)
4438 tgl@sss.pgh.pa.us 885 :UBC 0 : goto out_of_memory;
4438 tgl@sss.pgh.pa.us 886 :CBC 67 : pcc->cv.chrs = newchrs;
887 : 67 : pcc->cv.chrspace = pcc->cv.nchrs;
888 : : }
889 [ - + ]: 110 : if (pcc->cv.nranges == 0)
890 : : {
4438 tgl@sss.pgh.pa.us 891 :UBC 0 : free(pcc->cv.ranges);
892 : 0 : pcc->cv.ranges = NULL;
893 : 0 : pcc->cv.rangespace = 0;
894 : : }
4438 tgl@sss.pgh.pa.us 895 [ + - ]:CBC 110 : else if (pcc->cv.nranges < pcc->cv.rangespace)
896 : : {
897 : 110 : newchrs = (chr *) realloc(pcc->cv.ranges,
898 : 110 : pcc->cv.nranges * sizeof(chr) * 2);
899 [ - + ]: 110 : if (newchrs == NULL)
4438 tgl@sss.pgh.pa.us 900 :UBC 0 : goto out_of_memory;
4438 tgl@sss.pgh.pa.us 901 :CBC 110 : pcc->cv.ranges = newchrs;
902 : 110 : pcc->cv.rangespace = pcc->cv.nranges;
903 : : }
904 : :
905 : : /*
906 : : * Success, link it into cache chain
907 : : */
908 : 110 : pcc->next = pg_ctype_cache_list;
909 : 110 : pg_ctype_cache_list = pcc;
910 : :
911 : 110 : return &pcc->cv;
912 : :
913 : : /*
914 : : * Failure, clean up
915 : : */
4438 tgl@sss.pgh.pa.us 916 :UBC 0 : out_of_memory:
668 peter@eisentraut.org 917 : 0 : free(pcc->cv.chrs);
918 : 0 : free(pcc->cv.ranges);
4438 tgl@sss.pgh.pa.us 919 : 0 : free(pcc);
920 : :
921 : 0 : return NULL;
922 : : }
|