Age Owner TLA Line data Source code
1 : /*--------------------------------------------------------------------------
2 : *
3 : * test_regex.c
4 : * Test harness for the regular expression package.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : * IDENTIFICATION
10 : * src/test/modules/test_regex/test_regex.c
11 : *
12 : * -------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "funcapi.h"
18 : #include "miscadmin.h"
19 : #include "regex/regex.h"
20 : #include "utils/array.h"
21 : #include "utils/builtins.h"
22 :
/* Magic block required of a dynamically loaded PostgreSQL module. */
823 tgl 23 CBC 2 : PG_MODULE_MAGIC;
24 :
25 :
/*
 * Options decoded from the flags string given to test_regex().
 */
typedef struct test_re_flags
{
    int         cflags;         /* flags passed when compiling the regex */
    int         eflags;         /* flags passed when executing the regex */
    long        info;           /* re_info bits the test expects to be set */
    bool        glob;           /* find every occurrence, not just the first */
    bool        indices;        /* output match positions, not substrings */
    bool        partial;        /* a partial match is expected */
} test_re_flags;
36 :
37 : /* cross-call state for test_regex() */
38 : typedef struct test_regex_ctx
39 : {
40 : test_re_flags re_flags; /* flags */
41 : rm_detail_t details; /* "details" from execution */
42 : text *orig_str; /* data string in original TEXT form */
43 : int nmatches; /* number of places where pattern matched */
44 : int npatterns; /* number of capturing subpatterns */
45 : /* We store start char index and end+1 char index for each match */
46 : /* so the number of entries in match_locs is nmatches * npatterns * 2 */
47 : int *match_locs; /* 0-based character indexes */
48 : int next_match; /* 0-based index of next match to process */
49 : /* workspace for build_test_match_result() */
50 : Datum *elems; /* has npatterns+1 elements */
51 : bool *nulls; /* has npatterns+1 elements */
52 : pg_wchar *wide_str; /* wide-char version of original string */
53 : char *conv_buf; /* conversion buffer, if needed */
54 : int conv_bufsiz; /* size thereof */
55 : } test_regex_ctx;
56 :
57 : /* Local functions */
58 : static void test_re_compile(text *text_re, int cflags, Oid collation,
59 : regex_t *result_re);
60 : static void parse_test_flags(test_re_flags *flags, text *opts);
61 : static test_regex_ctx *setup_test_matches(text *orig_str,
62 : regex_t *cpattern,
63 : test_re_flags *re_flags,
64 : Oid collation,
65 : bool use_subpatterns);
66 : static ArrayType *build_test_info_result(regex_t *cpattern,
67 : test_re_flags *flags);
68 : static ArrayType *build_test_match_result(test_regex_ctx *matchctx);
69 :
70 :
71 : /*
72 : * test_regex(pattern text, string text, flags text) returns setof text[]
73 : *
74 : * This is largely based on regexp.c's regexp_matches, with additions
75 : * for debugging purposes.
76 : */
77 3 : PG_FUNCTION_INFO_V1(test_regex);
78 :
79 : Datum
80 1762 : test_regex(PG_FUNCTION_ARGS)
81 : {
82 : FuncCallContext *funcctx;
83 : test_regex_ctx *matchctx;
84 : ArrayType *result_ary;
85 :
86 1762 : if (SRF_IS_FIRSTCALL())
87 : {
88 694 : text *pattern = PG_GETARG_TEXT_PP(0);
89 694 : text *flags = PG_GETARG_TEXT_PP(2);
90 694 : Oid collation = PG_GET_COLLATION();
91 : test_re_flags re_flags;
92 : regex_t cpattern;
93 : MemoryContext oldcontext;
94 :
95 694 : funcctx = SRF_FIRSTCALL_INIT();
96 694 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
97 :
98 : /* Determine options */
99 694 : parse_test_flags(&re_flags, flags);
100 :
101 : /* set up the compiled pattern */
102 694 : test_re_compile(pattern, re_flags.cflags, collation, &cpattern);
103 :
104 : /* be sure to copy the input string into the multi-call ctx */
105 588 : matchctx = setup_test_matches(PG_GETARG_TEXT_P_COPY(1), &cpattern,
106 : &re_flags,
107 : collation,
108 : true);
109 :
110 : /* Pre-create workspace that build_test_match_result needs */
111 1176 : matchctx->elems = (Datum *) palloc(sizeof(Datum) *
112 588 : (matchctx->npatterns + 1));
113 1176 : matchctx->nulls = (bool *) palloc(sizeof(bool) *
114 588 : (matchctx->npatterns + 1));
115 :
116 588 : MemoryContextSwitchTo(oldcontext);
117 588 : funcctx->user_fctx = (void *) matchctx;
118 :
119 : /*
120 : * Return the first result row, which is info equivalent to Tcl's
121 : * "regexp -about" output
122 : */
123 588 : result_ary = build_test_info_result(&cpattern, &re_flags);
124 :
125 588 : pg_regfree(&cpattern);
126 :
127 588 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
128 : }
129 : else
130 : {
131 : /* Each subsequent row describes one match */
132 1068 : funcctx = SRF_PERCALL_SETUP();
133 1068 : matchctx = (test_regex_ctx *) funcctx->user_fctx;
134 :
135 1068 : if (matchctx->next_match < matchctx->nmatches)
136 : {
137 480 : result_ary = build_test_match_result(matchctx);
138 480 : matchctx->next_match++;
139 480 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
140 : }
141 : }
142 :
143 588 : SRF_RETURN_DONE(funcctx);
144 : }
145 :
146 :
147 : /*
148 : * test_re_compile - compile a RE
149 : *
150 : * text_re --- the pattern, expressed as a TEXT object
151 : * cflags --- compile options for the pattern
152 : * collation --- collation to use for LC_CTYPE-dependent behavior
153 : * result_re --- output, compiled RE is stored here
154 : *
155 : * Pattern is given in the database encoding. We internally convert to
156 : * an array of pg_wchar, which is what Spencer's regex package wants.
157 : *
158 : * Caller must eventually pg_regfree the resulting RE to avoid memory leaks.
159 : */
160 : static void
161 694 : test_re_compile(text *text_re, int cflags, Oid collation,
162 : regex_t *result_re)
163 : {
164 694 : int text_re_len = VARSIZE_ANY_EXHDR(text_re);
165 694 : char *text_re_val = VARDATA_ANY(text_re);
166 : pg_wchar *pattern;
167 : int pattern_len;
168 : int regcomp_result;
169 : char errMsg[100];
170 :
171 : /* Convert pattern string to wide characters */
172 694 : pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
173 694 : pattern_len = pg_mb2wchar_with_len(text_re_val,
174 : pattern,
175 : text_re_len);
176 :
177 694 : regcomp_result = pg_regcomp(result_re,
178 : pattern,
179 : pattern_len,
180 : cflags,
181 : collation);
182 :
183 694 : pfree(pattern);
184 :
185 694 : if (regcomp_result != REG_OKAY)
186 : {
187 : /* re didn't compile (no need for pg_regfree, if so) */
823 tgl 188 GIC 106 : pg_regerror(regcomp_result, result_re, errMsg, sizeof(errMsg));
189 106 : ereport(ERROR,
190 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
191 : errmsg("invalid regular expression: %s", errMsg)));
192 : }
823 tgl 193 CBC 588 : }
194 :
195 : /*
196 : * test_re_execute - execute a RE on pg_wchar data
197 : *
198 : * Returns true on match, false on no match
199 : * Arguments are as for pg_regexec
200 : */
201 : static bool
823 tgl 202 GIC 588 : test_re_execute(regex_t *re, pg_wchar *data, int data_len,
823 tgl 203 ECB : int start_search,
204 : rm_detail_t *details,
205 : int nmatch, regmatch_t *pmatch,
206 : int eflags)
207 : {
208 : int regexec_result;
209 : char errMsg[100];
210 :
211 : /* Initialize match locations in case engine doesn't */
823 tgl 212 CBC 588 : details->rm_extend.rm_so = -1;
823 tgl 213 GIC 588 : details->rm_extend.rm_eo = -1;
214 1462 : for (int i = 0; i < nmatch; i++)
215 : {
216 874 : pmatch[i].rm_so = -1;
217 874 : pmatch[i].rm_eo = -1;
218 : }
219 :
220 : /* Perform RE match and return result */
823 tgl 221 CBC 588 : regexec_result = pg_regexec(re,
222 : data,
223 : data_len,
823 tgl 224 EUB : start_search,
225 : details,
226 : nmatch,
227 : pmatch,
228 : eflags);
229 :
823 tgl 230 CBC 588 : if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
231 : {
232 : /* re failed??? */
823 tgl 233 UIC 0 : pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
234 0 : ereport(ERROR,
235 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
236 : errmsg("regular expression failed: %s", errMsg)));
237 : }
238 :
823 tgl 239 GIC 588 : return (regexec_result == REG_OKAY);
823 tgl 240 ECB : }
241 :
242 :
243 : /*
244 : * parse_test_flags - parse the flags argument
245 : *
246 : * flags --- output argument, filled with desired options
247 : * opts --- TEXT object, or NULL for defaults
248 : */
249 : static void
823 tgl 250 GIC 694 : parse_test_flags(test_re_flags *flags, text *opts)
823 tgl 251 ECB : {
252 : /* these defaults must match Tcl's */
823 tgl 253 CBC 694 : int cflags = REG_ADVANCED;
254 694 : int eflags = 0;
823 tgl 255 GIC 694 : long info = 0;
256 :
823 tgl 257 CBC 694 : flags->glob = false;
823 tgl 258 GIC 694 : flags->indices = false;
823 tgl 259 CBC 694 : flags->partial = false;
260 :
261 694 : if (opts)
262 : {
263 694 : char *opt_p = VARDATA_ANY(opts);
264 694 : int opt_len = VARSIZE_ANY_EXHDR(opts);
823 tgl 265 ECB : int i;
266 :
823 tgl 267 CBC 1880 : for (i = 0; i < opt_len; i++)
268 : {
269 1186 : switch (opt_p[i])
823 tgl 270 ECB : {
823 tgl 271 CBC 78 : case '-':
823 tgl 272 ECB : /* allowed, no-op */
823 tgl 273 GIC 78 : break;
274 7 : case '!':
823 tgl 275 GBC 7 : flags->partial = true;
276 7 : break;
277 1 : case '*':
823 tgl 278 ECB : /* test requires Unicode --- ignored here */
823 tgl 279 CBC 1 : break;
280 53 : case '0':
281 53 : flags->indices = true;
282 53 : break;
823 tgl 283 ECB :
284 : /* These flags correspond to user-exposed RE options: */
823 tgl 285 LBC 0 : case 'g': /* global match */
286 0 : flags->glob = true;
287 0 : break;
823 tgl 288 CBC 20 : case 'i': /* case insensitive */
289 20 : cflags |= REG_ICASE;
290 20 : break;
291 35 : case 'n': /* \n affects ^ $ . [^ */
292 35 : cflags |= REG_NEWLINE;
293 35 : break;
294 2 : case 'p': /* ~Perl, \n affects . [^ */
823 tgl 295 GIC 2 : cflags |= REG_NLSTOP;
296 2 : cflags &= ~REG_NLANCH;
823 tgl 297 CBC 2 : break;
298 2 : case 'w': /* weird, \n affects ^ $ only */
299 2 : cflags &= ~REG_NLSTOP;
300 2 : cflags |= REG_NLANCH;
301 2 : break;
302 14 : case 'x': /* expanded syntax */
303 14 : cflags |= REG_EXPANDED;
823 tgl 304 GIC 14 : break;
305 :
306 : /* These flags correspond to Tcl's -xflags options: */
307 2 : case 'a':
308 2 : cflags |= REG_ADVF;
309 2 : break;
310 131 : case 'b':
311 131 : cflags &= ~REG_ADVANCED;
312 131 : break;
823 tgl 313 CBC 11 : case 'c':
823 tgl 314 ECB :
315 : /*
316 : * Tcl calls this TCL_REG_CANMATCH, but it's really
317 : * REG_EXPECT. In this implementation we must also set
318 : * the partial and indices flags, so that
319 : * setup_test_matches and build_test_match_result will
320 : * emit the desired data. (They'll emit more fields than
321 : * Tcl would, but that's fine.)
322 : */
823 tgl 323 CBC 11 : cflags |= REG_EXPECT;
324 11 : flags->partial = true;
325 11 : flags->indices = true;
326 11 : break;
327 10 : case 'e':
328 10 : cflags &= ~REG_ADVANCED;
329 10 : cflags |= REG_EXTENDED;
330 10 : break;
331 6 : case 'q':
332 6 : cflags &= ~REG_ADVANCED;
333 6 : cflags |= REG_QUOTE;
823 tgl 334 GBC 6 : break;
335 2 : case 'o': /* o for opaque */
336 2 : cflags |= REG_NOSUB;
337 2 : break;
338 2 : case 's': /* s for start */
339 2 : cflags |= REG_BOSONLY;
340 2 : break;
341 6 : case '+':
342 6 : cflags |= REG_FAKE;
343 6 : break;
823 tgl 344 UBC 0 : case ',':
345 0 : cflags |= REG_PROGRESS;
823 tgl 346 LBC 0 : break;
347 0 : case '.':
348 0 : cflags |= REG_DUMP;
349 0 : break;
350 0 : case ':':
351 0 : eflags |= REG_MTRACE;
352 0 : break;
353 0 : case ';':
354 0 : eflags |= REG_FTRACE;
355 0 : break;
823 tgl 356 CBC 6 : case '^':
357 6 : eflags |= REG_NOTBOL;
823 tgl 358 GIC 6 : break;
359 4 : case '$':
823 tgl 360 CBC 4 : eflags |= REG_NOTEOL;
361 4 : break;
362 17 : case 't':
363 17 : cflags |= REG_EXPECT;
364 17 : break;
365 5 : case '%':
366 5 : eflags |= REG_SMALL;
367 5 : break;
823 tgl 368 ECB :
369 : /* These flags define expected info bits: */
823 tgl 370 CBC 5 : case 'A':
371 5 : info |= REG_UBSALNUM;
372 5 : break;
373 4 : case 'B':
374 4 : info |= REG_UBRACES;
375 4 : break;
376 40 : case 'E':
377 40 : info |= REG_UBBS;
378 40 : break;
379 34 : case 'H':
380 34 : info |= REG_ULOOKAROUND;
381 34 : break;
382 10 : case 'I':
383 10 : info |= REG_UIMPOSSIBLE;
384 10 : break;
385 162 : case 'L':
386 162 : info |= REG_ULOCALE;
387 162 : break;
388 43 : case 'M':
389 43 : info |= REG_UUNPORT;
390 43 : break;
391 46 : case 'N':
392 46 : info |= REG_UEMPTYMATCH;
393 46 : break;
394 305 : case 'P':
395 305 : info |= REG_UNONPOSIX;
396 305 : break;
397 35 : case 'Q':
398 35 : info |= REG_UBOUNDS;
399 35 : break;
400 42 : case 'R':
401 42 : info |= REG_UBACKREF;
823 tgl 402 GIC 42 : break;
823 tgl 403 GBC 25 : case 'S':
404 25 : info |= REG_UUNSPEC;
823 tgl 405 GIC 25 : break;
406 20 : case 'T':
407 20 : info |= REG_USHORTEST;
408 20 : break;
409 1 : case 'U':
410 1 : info |= REG_UPBOTCH;
411 1 : break;
823 tgl 412 ECB :
823 tgl 413 LBC 0 : default:
414 0 : ereport(ERROR,
823 tgl 415 ECB : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
416 : errmsg("invalid regular expression test option: \"%.*s\"",
417 : pg_mblen(opt_p + i), opt_p + i)));
418 : break;
419 : }
420 : }
421 : }
823 tgl 422 GIC 694 : flags->cflags = cflags;
423 694 : flags->eflags = eflags;
424 694 : flags->info = info;
823 tgl 425 CBC 694 : }
426 :
427 : /*
428 : * setup_test_matches --- do the initial matching
429 : *
823 tgl 430 ECB : * To simplify memory management, we do all the matching in one swoop.
431 : * The returned test_regex_ctx contains the locations of all the substrings
432 : * matching the pattern.
433 : */
434 : static test_regex_ctx *
823 tgl 435 GIC 588 : setup_test_matches(text *orig_str,
436 : regex_t *cpattern, test_re_flags *re_flags,
437 : Oid collation,
438 : bool use_subpatterns)
439 : {
440 588 : test_regex_ctx *matchctx = palloc0(sizeof(test_regex_ctx));
823 tgl 441 CBC 588 : int eml = pg_database_encoding_max_length();
442 : int orig_len;
443 : pg_wchar *wide_str;
823 tgl 444 ECB : int wide_len;
445 : regmatch_t *pmatch;
446 : int pmatch_len;
447 : int array_len;
448 : int array_idx;
449 : int prev_match_end;
450 : int start_search;
823 tgl 451 CBC 588 : int maxlen = 0; /* largest fetch length in characters */
823 tgl 452 ECB :
453 : /* save flags */
823 tgl 454 GIC 588 : matchctx->re_flags = *re_flags;
823 tgl 455 ECB :
456 : /* save original string --- we'll extract result substrings from it */
823 tgl 457 CBC 588 : matchctx->orig_str = orig_str;
823 tgl 458 ECB :
459 : /* convert string to pg_wchar form for matching */
823 tgl 460 GIC 588 : orig_len = VARSIZE_ANY_EXHDR(orig_str);
461 588 : wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
823 tgl 462 CBC 588 : wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
823 tgl 463 ECB :
464 : /* do we want to remember subpatterns? */
823 tgl 465 GIC 588 : if (use_subpatterns && cpattern->re_nsub > 0)
466 : {
467 127 : matchctx->npatterns = cpattern->re_nsub + 1;
823 tgl 468 CBC 127 : pmatch_len = cpattern->re_nsub + 1;
469 : }
470 : else
471 : {
823 tgl 472 GIC 461 : use_subpatterns = false;
473 461 : matchctx->npatterns = 1;
474 461 : pmatch_len = 1;
475 : }
823 tgl 476 ECB :
477 : /* temporary output space for RE package */
823 tgl 478 CBC 588 : pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
479 :
480 : /*
823 tgl 481 ECB : * the real output space (grown dynamically if needed)
482 : *
483 : * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
484 : * than at 2^27
485 : */
823 tgl 486 GIC 588 : array_len = re_flags->glob ? 255 : 31;
487 588 : matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
488 588 : array_idx = 0;
489 :
823 tgl 490 ECB : /* search for the pattern, perhaps repeatedly */
823 tgl 491 GIC 588 : prev_match_end = 0;
823 tgl 492 GBC 588 : start_search = 0;
493 588 : while (test_re_execute(cpattern, wide_str, wide_len,
823 tgl 494 EUB : start_search,
495 : &matchctx->details,
496 : pmatch_len, pmatch,
497 : re_flags->eflags))
498 : {
499 : /* enlarge output space if needed */
823 tgl 500 GIC 462 : while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
501 : {
823 tgl 502 LBC 0 : array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
823 tgl 503 UIC 0 : if (array_len > MaxAllocSize / sizeof(int))
823 tgl 504 LBC 0 : ereport(ERROR,
823 tgl 505 ECB : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
506 : errmsg("too many regular expression matches")));
823 tgl 507 LBC 0 : matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
823 tgl 508 ECB : sizeof(int) * array_len);
509 : }
510 :
511 : /* save this match's locations */
823 tgl 512 CBC 1092 : for (int i = 0; i < matchctx->npatterns; i++)
823 tgl 513 ECB : {
823 tgl 514 GIC 630 : int so = pmatch[i].rm_so;
515 630 : int eo = pmatch[i].rm_eo;
823 tgl 516 ECB :
823 tgl 517 CBC 630 : matchctx->match_locs[array_idx++] = so;
823 tgl 518 GIC 630 : matchctx->match_locs[array_idx++] = eo;
519 630 : if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
520 438 : maxlen = (eo - so);
521 : }
522 462 : matchctx->nmatches++;
523 462 : prev_match_end = pmatch[0].rm_eo;
524 :
823 tgl 525 EUB : /* if not glob, stop after one match */
823 tgl 526 GBC 462 : if (!re_flags->glob)
527 462 : break;
823 tgl 528 EUB :
529 : /*
530 : * Advance search position. Normally we start the next search at the
531 : * end of the previous match; but if the match was of zero length, we
532 : * have to advance by one character, or we'd just find the same match
533 : * again.
534 : */
823 tgl 535 UIC 0 : start_search = prev_match_end;
823 tgl 536 LBC 0 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
823 tgl 537 UIC 0 : start_search++;
538 0 : if (start_search > wide_len)
823 tgl 539 LBC 0 : break;
540 : }
823 tgl 541 EUB :
542 : /*
543 : * If we had no match, but "partial" and "indices" are set, emit the
544 : * details.
545 : */
823 tgl 546 GBC 588 : if (matchctx->nmatches == 0 && re_flags->partial && re_flags->indices)
547 : {
548 : /* enlarge output space if needed */
812 tgl 549 GIC 18 : while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
812 tgl 550 ECB : {
812 tgl 551 LBC 0 : array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
812 tgl 552 UIC 0 : if (array_len > MaxAllocSize / sizeof(int))
812 tgl 553 LBC 0 : ereport(ERROR,
554 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
812 tgl 555 ECB : errmsg("too many regular expression matches")));
812 tgl 556 LBC 0 : matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
557 : sizeof(int) * array_len);
812 tgl 558 ECB : }
559 :
823 tgl 560 GIC 18 : matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_so;
823 tgl 561 CBC 18 : matchctx->match_locs[array_idx++] = matchctx->details.rm_extend.rm_eo;
562 : /* we don't have pmatch data, so emit -1 */
563 20 : for (int i = 1; i < matchctx->npatterns; i++)
564 : {
565 2 : matchctx->match_locs[array_idx++] = -1;
823 tgl 566 GIC 2 : matchctx->match_locs[array_idx++] = -1;
567 : }
568 18 : matchctx->nmatches++;
569 : }
570 :
812 571 588 : Assert(array_idx <= array_len);
572 :
823 573 588 : if (eml > 1)
574 : {
575 588 : int64 maxsiz = eml * (int64) maxlen;
576 : int conv_bufsiz;
577 :
823 tgl 578 ECB : /*
579 : * Make the conversion buffer large enough for any substring of
580 : * interest.
581 : *
582 : * Worst case: assume we need the maximum size (maxlen*eml), but take
583 : * advantage of the fact that the original string length in bytes is
584 : * an upper bound on the byte length of any fetched substring (and we
585 : * know that len+1 is safe to allocate because the varlena header is
586 : * longer than 1 byte).
587 : */
823 tgl 588 GIC 588 : if (maxsiz > orig_len)
589 415 : conv_bufsiz = orig_len + 1;
823 tgl 590 EUB : else
823 tgl 591 GBC 173 : conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */
823 tgl 592 EUB :
823 tgl 593 GBC 588 : matchctx->conv_buf = palloc(conv_bufsiz);
823 tgl 594 GIC 588 : matchctx->conv_bufsiz = conv_bufsiz;
595 588 : matchctx->wide_str = wide_str;
596 : }
823 tgl 597 ECB : else
598 : {
599 : /* No need to keep the wide string if we're in a single-byte charset. */
823 tgl 600 UIC 0 : pfree(wide_str);
601 0 : matchctx->wide_str = NULL;
602 0 : matchctx->conv_buf = NULL;
603 0 : matchctx->conv_bufsiz = 0;
604 : }
605 :
606 : /* Clean up temp storage */
823 tgl 607 GIC 588 : pfree(pmatch);
823 tgl 608 ECB :
823 tgl 609 GIC 588 : return matchctx;
610 : }
611 :
612 : /*
613 : * build_test_info_result - build output array describing compiled regexp
614 : *
615 : * This borrows some code from Tcl's TclRegAbout().
616 : */
617 : static ArrayType *
618 588 : build_test_info_result(regex_t *cpattern, test_re_flags *flags)
619 : {
620 : /* Translation data for flag bits in regex_t.re_info */
621 : struct infoname
622 : {
623 : int bit;
624 : const char *text;
625 : };
626 : static const struct infoname infonames[] = {
627 : {REG_UBACKREF, "REG_UBACKREF"},
628 : {REG_ULOOKAROUND, "REG_ULOOKAROUND"},
629 : {REG_UBOUNDS, "REG_UBOUNDS"},
630 : {REG_UBRACES, "REG_UBRACES"},
631 : {REG_UBSALNUM, "REG_UBSALNUM"},
632 : {REG_UPBOTCH, "REG_UPBOTCH"},
633 : {REG_UBBS, "REG_UBBS"},
634 : {REG_UNONPOSIX, "REG_UNONPOSIX"},
823 tgl 635 ECB : {REG_UUNSPEC, "REG_UUNSPEC"},
636 : {REG_UUNPORT, "REG_UUNPORT"},
637 : {REG_ULOCALE, "REG_ULOCALE"},
638 : {REG_UEMPTYMATCH, "REG_UEMPTYMATCH"},
639 : {REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"},
640 : {REG_USHORTEST, "REG_USHORTEST"},
641 : {0, NULL}
642 : };
643 : const struct infoname *inf;
644 : Datum elems[lengthof(infonames) + 1];
823 tgl 645 CBC 588 : int nresults = 0;
646 : char buf[80];
823 tgl 647 ECB : int dims[1];
648 : int lbs[1];
649 :
650 : /* Set up results: first, the number of subexpressions */
823 tgl 651 GIC 588 : snprintf(buf, sizeof(buf), "%d", (int) cpattern->re_nsub);
652 588 : elems[nresults++] = PointerGetDatum(cstring_to_text(buf));
823 tgl 653 EUB :
654 : /* Report individual info bit states */
823 tgl 655 GIC 8820 : for (inf = infonames; inf->bit != 0; inf++)
656 : {
657 8232 : if (cpattern->re_info & inf->bit)
658 : {
823 tgl 659 CBC 749 : if (flags->info & inf->bit)
823 tgl 660 GIC 749 : elems[nresults++] = PointerGetDatum(cstring_to_text(inf->text));
823 tgl 661 EUB : else
662 : {
823 tgl 663 UIC 0 : snprintf(buf, sizeof(buf), "unexpected %s!", inf->text);
664 0 : elems[nresults++] = PointerGetDatum(cstring_to_text(buf));
665 : }
666 : }
667 : else
823 tgl 668 ECB : {
823 tgl 669 CBC 7483 : if (flags->info & inf->bit)
670 : {
823 tgl 671 LBC 0 : snprintf(buf, sizeof(buf), "missing %s!", inf->text);
823 tgl 672 UIC 0 : elems[nresults++] = PointerGetDatum(cstring_to_text(buf));
673 : }
674 : }
675 : }
676 :
677 : /* And form an array */
823 tgl 678 GIC 588 : dims[0] = nresults;
679 588 : lbs[0] = 1;
680 : /* XXX: this hardcodes assumptions about the text type */
681 588 : return construct_md_array(elems, NULL, 1, dims, lbs,
823 tgl 682 ECB : TEXTOID, -1, false, TYPALIGN_INT);
683 : }
684 :
685 : /*
686 : * build_test_match_result - build output array for current match
687 : *
688 : * Note that if the indices flag is set, we don't need any strings,
689 : * just the location data.
690 : */
691 : static ArrayType *
823 tgl 692 GIC 480 : build_test_match_result(test_regex_ctx *matchctx)
693 : {
694 480 : char *buf = matchctx->conv_buf;
823 tgl 695 CBC 480 : Datum *elems = matchctx->elems;
696 480 : bool *nulls = matchctx->nulls;
823 tgl 697 GIC 480 : bool indices = matchctx->re_flags.indices;
823 tgl 698 ECB : char bufstr[80];
699 : int dims[1];
700 : int lbs[1];
701 : int loc;
702 : int i;
703 :
704 : /* Extract matching substrings from the original string */
823 tgl 705 GIC 480 : loc = matchctx->next_match * matchctx->npatterns * 2;
823 tgl 706 CBC 1130 : for (i = 0; i < matchctx->npatterns; i++)
823 tgl 707 ECB : {
823 tgl 708 GIC 650 : int so = matchctx->match_locs[loc++];
823 tgl 709 CBC 650 : int eo = matchctx->match_locs[loc++];
710 :
711 650 : if (indices)
823 tgl 712 ECB : {
713 : /* Report eo this way for consistency with Tcl */
823 tgl 714 CBC 84 : snprintf(bufstr, sizeof(bufstr), "%d %d",
715 : so, so < 0 ? eo : eo - 1);
716 84 : elems[i] = PointerGetDatum(cstring_to_text(bufstr));
823 tgl 717 GIC 84 : nulls[i] = false;
718 : }
719 566 : else if (so < 0 || eo < 0)
823 tgl 720 ECB : {
823 tgl 721 CBC 12 : elems[i] = (Datum) 0;
722 12 : nulls[i] = true;
723 : }
823 tgl 724 GIC 554 : else if (buf)
725 : {
823 tgl 726 GBC 554 : int len = pg_wchar2mb_with_len(matchctx->wide_str + so,
727 : buf,
728 : eo - so);
729 :
730 554 : Assert(len < matchctx->conv_bufsiz);
823 tgl 731 GIC 554 : elems[i] = PointerGetDatum(cstring_to_text_with_len(buf, len));
732 554 : nulls[i] = false;
733 : }
734 : else
823 tgl 735 ECB : {
823 tgl 736 UIC 0 : elems[i] = DirectFunctionCall3(text_substr,
823 tgl 737 ECB : PointerGetDatum(matchctx->orig_str),
738 : Int32GetDatum(so + 1),
739 : Int32GetDatum(eo - so));
823 tgl 740 LBC 0 : nulls[i] = false;
741 : }
823 tgl 742 ECB : }
743 :
744 : /* In EXPECT indices mode, also report the "details" */
823 tgl 745 GIC 480 : if (indices && (matchctx->re_flags.cflags & REG_EXPECT))
746 : {
747 28 : int so = matchctx->details.rm_extend.rm_so;
823 tgl 748 CBC 28 : int eo = matchctx->details.rm_extend.rm_eo;
823 tgl 749 ECB :
823 tgl 750 GIC 28 : snprintf(bufstr, sizeof(bufstr), "%d %d",
823 tgl 751 ECB : so, so < 0 ? eo : eo - 1);
823 tgl 752 GIC 28 : elems[i] = PointerGetDatum(cstring_to_text(bufstr));
753 28 : nulls[i] = false;
754 28 : i++;
755 : }
756 :
757 : /* And form an array */
758 480 : dims[0] = i;
759 480 : lbs[0] = 1;
760 : /* XXX: this hardcodes assumptions about the text type */
761 480 : return construct_md_array(elems, nulls, 1, dims, lbs,
762 : TEXTOID, -1, false, TYPALIGN_INT);
763 : }
|