TLA Line data Source code
1 : /*
2 : * re_*exec and friends - match REs
3 : *
4 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
5 : *
6 : * Development of this software was funded, in part, by Cray Research Inc.,
7 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 : * Corporation, none of whom are responsible for the results. The author
9 : * thanks all of them.
10 : *
11 : * Redistribution and use in source and binary forms -- with or without
12 : * modification -- are permitted for any purpose, provided that
13 : * redistributions in source form retain this entire copyright notice and
14 : * indicate the origin and nature of any modifications.
15 : *
16 : * I'd appreciate being given credit for this package in the documentation
17 : * of software which uses it, but that is not a requirement.
18 : *
19 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 : *
30 : * src/backend/regex/regexec.c
31 : *
32 : */
33 :
34 : #include "regex/regguts.h"
35 :
36 :
37 :
38 : /* lazy-DFA representation */
39 : struct arcp
40 : { /* "pointer" to an outarc */
41 : struct sset *ss;
42 : color co;
43 : };
44 :
45 : struct sset
46 : { /* state set */
47 : unsigned *states; /* pointer to bitvector */
48 : unsigned hash; /* hash of bitvector */
49 : #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
50 : #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
51 : memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
52 : int flags;
53 : #define STARTER 01 /* the initial state set */
54 : #define POSTSTATE 02 /* includes the goal state */
55 : #define LOCKED 04 /* locked in cache */
56 : #define NOPROGRESS 010 /* zero-progress state set */
57 : struct arcp ins; /* chain of inarcs pointing here */
58 : chr *lastseen; /* last entered on arrival here */
59 : struct sset **outs; /* outarc vector indexed by color */
60 : struct arcp *inchain; /* chain-pointer vector for outarcs */
61 : };
62 :
63 : struct dfa
64 : {
65 : int nssets; /* size of cache */
66 : int nssused; /* how many entries occupied yet */
67 : int nstates; /* number of states */
68 : int ncolors; /* length of outarc and inchain vectors */
69 : int wordsper; /* length of state-set bitvectors */
70 : struct sset *ssets; /* state-set cache */
71 : unsigned *statesarea; /* bitvector storage */
72 : unsigned *work; /* pointer to work area within statesarea */
73 : struct sset **outsarea; /* outarc-vector storage */
74 : struct arcp *incarea; /* inchain storage */
75 : struct cnfa *cnfa;
76 : struct colormap *cm;
77 : chr *lastpost; /* location of last cache-flushed success */
78 : chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
79 : struct sset *search; /* replacement-search-pointer memory */
80 : int backno; /* if DFA for a backref, subno it refers to */
81 : short backmin; /* min repetitions for backref */
82 : short backmax; /* max repetitions for backref */
83 : bool ismalloced; /* should this struct dfa be freed? */
84 : bool arraysmalloced; /* should its subsidiary arrays be freed? */
85 : };
86 :
87 : #define WORK 1 /* number of work bitvectors needed */
88 :
89 : /* setup for non-malloc allocation for small cases */
90 : #define FEWSTATES 20 /* must be less than UBITS */
91 : #define FEWCOLORS 15
92 : struct smalldfa
93 : {
94 : struct dfa dfa; /* must be first */
95 : struct sset ssets[FEWSTATES * 2];
96 : unsigned statesarea[FEWSTATES * 2 + WORK];
97 : struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
98 : struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
99 : };
100 :
101 : #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */
102 :
103 :
104 :
105 : /* internal variables, bundled for easy passing around */
106 : struct vars
107 : {
108 : regex_t *re;
109 : struct guts *g;
110 : int eflags; /* copies of arguments */
111 : size_t nmatch;
112 : regmatch_t *pmatch;
113 : rm_detail_t *details;
114 : chr *start; /* start of string */
115 : chr *search_start; /* search start of string */
116 : chr *stop; /* just past end of string */
117 : int err; /* error code if any (0 none) */
118 : struct dfa **subdfas; /* per-tree-subre DFAs */
119 : struct dfa **ladfas; /* per-lacon-subre DFAs */
120 : struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
121 : chr **lblastcp; /* per-lacon-subre lookbehind restart data */
122 : struct smalldfa dfa1;
123 : struct smalldfa dfa2;
124 : };
125 :
126 : #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
127 : #define ISERR() VISERR(v)
128 : #define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e)))
129 : #define ERR(e) VERR(v, e) /* record an error */
130 : #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
131 : #define OFF(p) ((p) - v->start)
132 : #define LOFF(p) ((long)OFF(p))
133 :
134 :
135 :
136 : /*
137 : * forward declarations
138 : */
139 : /* === regexec.c === */
140 : static struct dfa *getsubdfa(struct vars *v, struct subre *t);
141 : static struct dfa *getladfa(struct vars *v, int n);
142 : static int find(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
143 : static int cfind(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
144 : static int cfindloop(struct vars *v, struct cnfa *cnfa, struct colormap *cm,
145 : struct dfa *d, struct dfa *s, chr **coldp);
146 : static void zapallsubs(regmatch_t *p, size_t n);
147 : static void zaptreesubs(struct vars *v, struct subre *t);
148 : static void subset(struct vars *v, struct subre *sub, chr *begin, chr *end);
149 : static int cdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
150 : static int ccondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
151 : static int crevcondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
152 : static int cbrdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
153 : static int caltdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
154 : static int citerdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
155 : static int creviterdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
156 :
157 : /* === rege_dfa.c === */
158 : static chr *longest(struct vars *v, struct dfa *d,
159 : chr *start, chr *stop, int *hitstopp);
160 : static chr *shortest(struct vars *v, struct dfa *d, chr *start, chr *min,
161 : chr *max, chr **coldp, int *hitstopp);
162 : static int matchuntil(struct vars *v, struct dfa *d, chr *probe,
163 : struct sset **lastcss, chr **lastcp);
164 : static chr *dfa_backref(struct vars *v, struct dfa *d, chr *start,
165 : chr *min, chr *max, bool shortest);
166 : static chr *lastcold(struct vars *v, struct dfa *d);
167 : static struct dfa *newdfa(struct vars *v, struct cnfa *cnfa,
168 : struct colormap *cm, struct smalldfa *sml);
169 : static void freedfa(struct dfa *d);
170 : static unsigned hash(unsigned *uv, int n);
171 : static struct sset *initialize(struct vars *v, struct dfa *d, chr *start);
172 : static struct sset *miss(struct vars *v, struct dfa *d, struct sset *css,
173 : color co, chr *cp, chr *start);
174 : static int lacon(struct vars *v, struct cnfa *pcnfa, chr *cp, color co);
175 : static struct sset *getvacant(struct vars *v, struct dfa *d, chr *cp,
176 : chr *start);
177 : static struct sset *pickss(struct vars *v, struct dfa *d, chr *cp,
178 : chr *start);
179 :
180 :
181 : /*
182 : * pg_regexec - match regular expression
183 : */
184 : int
185 GIC 901896 : pg_regexec(regex_t *re,
186 : const chr *string,
187 : size_t len,
188 : size_t search_start,
189 : rm_detail_t *details,
190 : size_t nmatch,
191 : regmatch_t pmatch[],
192 : int flags)
193 : {
194 ECB : struct vars var;
195 GNC 901896 : struct vars *v = &var;
196 : int st;
197 : size_t n;
198 : size_t i;
199 : int backref;
200 :
201 : #define LOCALMAT 20
202 : regmatch_t mat[LOCALMAT];
203 :
204 ECB : #define LOCALDFAS 40
205 : struct dfa *subdfas[LOCALDFAS];
206 :
207 : /* sanity checks */
208 GIC 901896 : if (re == NULL || string == NULL || re->re_magic != REMAGIC)
209 UIC 0 : return REG_INVARG;
210 GIC 901896 : if (re->re_csize != sizeof(chr))
211 UIC 0 : return REG_MIXED;
212 GIC 901896 : if (search_start > len)
213 3 : return REG_NOMATCH;
214 :
215 : /* Initialize locale-dependent support */
216 901893 : pg_set_regex_collation(re->re_collation);
217 ECB :
218 EUB : /* setup */
219 CBC 901893 : v->re = re;
220 GBC 901893 : v->g = (struct guts *) re->re_guts;
221 CBC 901893 : if ((v->g->cflags & REG_EXPECT) && details == NULL)
222 LBC 0 : return REG_INVARG;
223 GIC 901893 : if (v->g->info & REG_UIMPOSSIBLE)
224 715 : return REG_NOMATCH;
225 CBC 901178 : backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
226 GIC 901178 : v->eflags = flags;
227 901178 : if (backref && nmatch <= v->g->nsub)
228 ECB : {
229 : /* need larger work area */
230 CBC 91 : v->nmatch = v->g->nsub + 1;
231 GBC 91 : if (v->nmatch <= LOCALMAT)
232 CBC 90 : v->pmatch = mat;
233 ECB : else
234 CBC 1 : v->pmatch = (regmatch_t *) MALLOC(v->nmatch * sizeof(regmatch_t));
235 91 : if (v->pmatch == NULL)
236 LBC 0 : return REG_ESPACE;
237 GIC 91 : zapallsubs(v->pmatch, v->nmatch);
238 : }
239 ECB : else
240 : {
241 : /* we can store results directly in caller's array */
242 GIC 901087 : v->pmatch = pmatch;
243 ECB : /* ensure any extra entries in caller's array are filled with -1 */
244 CBC 901087 : if (nmatch > 0)
245 GBC 556472 : zapallsubs(pmatch, nmatch);
246 ECB : /* then forget about extra entries, to avoid useless work in find() */
247 GIC 901087 : if (nmatch > v->g->nsub + 1)
248 677 : nmatch = v->g->nsub + 1;
249 901087 : v->nmatch = nmatch;
250 : }
251 CBC 901178 : v->details = details;
252 GIC 901178 : v->start = (chr *) string;
253 CBC 901178 : v->search_start = (chr *) string + search_start;
254 901178 : v->stop = (chr *) string + len;
255 GIC 901178 : v->err = 0;
256 CBC 901178 : v->subdfas = NULL;
257 901178 : v->ladfas = NULL;
258 901178 : v->lblastcss = NULL;
259 GIC 901178 : v->lblastcp = NULL;
260 ECB : /* below this point, "goto cleanup" will behave sanely */
261 :
262 CBC 901178 : assert(v->g->ntree >= 0);
263 901178 : n = (size_t) v->g->ntree;
264 901178 : if (n <= LOCALDFAS)
265 901176 : v->subdfas = subdfas;
266 ECB : else
267 : {
268 CBC 2 : v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
269 GIC 2 : if (v->subdfas == NULL)
270 : {
271 LBC 0 : st = REG_ESPACE;
272 0 : goto cleanup;
273 ECB : }
274 : }
275 GIC 2719024 : for (i = 0; i < n; i++)
276 1817846 : v->subdfas[i] = NULL;
277 ECB :
278 CBC 901178 : assert(v->g->nlacons >= 0);
279 GIC 901178 : n = (size_t) v->g->nlacons;
280 GBC 901178 : if (n > 0)
281 EUB : {
282 GIC 630 : v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
283 630 : if (v->ladfas == NULL)
284 ECB : {
285 LBC 0 : st = REG_ESPACE;
286 UIC 0 : goto cleanup;
287 ECB : }
288 CBC 1922 : for (i = 0; i < n; i++)
289 1292 : v->ladfas[i] = NULL;
290 GIC 630 : v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
291 CBC 630 : v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
292 630 : if (v->lblastcss == NULL || v->lblastcp == NULL)
293 : {
294 UBC 0 : st = REG_ESPACE;
295 0 : goto cleanup;
296 : }
297 CBC 1922 : for (i = 0; i < n; i++)
298 ECB : {
299 CBC 1292 : v->lblastcss[i] = NULL;
300 1292 : v->lblastcp[i] = NULL;
301 ECB : }
302 : }
303 EUB :
304 : /* do it */
305 GIC 901178 : assert(v->g->tree != NULL);
306 CBC 901178 : if (backref)
307 GIC 145 : st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
308 ECB : else
309 CBC 901033 : st = find(v, &v->g->tree->cnfa, &v->g->cmap);
310 :
311 : /* on success, ensure caller's match vector is filled correctly */
312 GIC 901178 : if (st == REG_OKAY && nmatch > 0)
313 : {
314 CBC 451389 : if (v->pmatch != pmatch)
315 ECB : {
316 : /* copy portion of match vector over from (larger) work area */
317 GIC 21 : assert(nmatch <= v->nmatch);
318 CBC 21 : memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
319 : }
320 GIC 451389 : if (v->g->cflags & REG_NOSUB)
321 ECB : {
322 : /* don't expose possibly-partial sub-match results to caller */
323 CBC 448309 : zapallsubs(pmatch, nmatch);
324 : }
325 : }
326 ECB :
327 : /* clean up */
328 GIC 452869 : cleanup:
329 CBC 901178 : if (v->pmatch != pmatch && v->pmatch != mat)
330 GIC 1 : FREE(v->pmatch);
331 901178 : if (v->subdfas != NULL)
332 ECB : {
333 GIC 901178 : n = (size_t) v->g->ntree;
334 2719024 : for (i = 0; i < n; i++)
335 : {
336 1817846 : if (v->subdfas[i] != NULL)
337 CBC 12368 : freedfa(v->subdfas[i]);
338 ECB : }
339 CBC 901178 : if (v->subdfas != subdfas)
340 2 : FREE(v->subdfas);
341 : }
342 901178 : if (v->ladfas != NULL)
343 ECB : {
344 GIC 630 : n = (size_t) v->g->nlacons;
345 CBC 1922 : for (i = 0; i < n; i++)
346 ECB : {
347 GIC 1292 : if (v->ladfas[i] != NULL)
348 CBC 53 : freedfa(v->ladfas[i]);
349 ECB : }
350 GIC 630 : FREE(v->ladfas);
351 ECB : }
352 GIC 901178 : if (v->lblastcss != NULL)
353 CBC 630 : FREE(v->lblastcss);
354 901178 : if (v->lblastcp != NULL)
355 GIC 630 : FREE(v->lblastcp);
356 ECB :
357 : #ifdef REG_DEBUG
358 : if (v->eflags & (REG_FTRACE | REG_MTRACE))
359 : fflush(stdout);
360 : #endif
361 :
362 CBC 901178 : return st;
363 ECB : }
364 :
365 : /*
366 : * getsubdfa - create or re-fetch the DFA for a tree subre node
367 : *
368 : * We only need to create the DFA once per overall regex execution.
369 : * The DFA will be freed by the cleanup step in pg_regexec().
370 : */
371 : static struct dfa *
372 GIC 13054 : getsubdfa(struct vars *v,
373 : struct subre *t)
374 : {
375 13054 : struct dfa *d = v->subdfas[t->id];
376 :
377 13054 : if (d == NULL)
378 : {
379 12368 : d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
380 12368 : if (d == NULL)
381 LBC 0 : return NULL;
382 : /* set up additional info if this is a backref node */
383 GIC 12368 : if (t->op == 'b')
384 ECB : {
385 GIC 140 : d->backno = t->backno;
386 CBC 140 : d->backmin = t->min;
387 GIC 140 : d->backmax = t->max;
388 ECB : }
389 CBC 12368 : v->subdfas[t->id] = d;
390 EUB : }
391 GIC 13054 : return d;
392 ECB : }
393 :
394 : /*
395 : * getladfa - create or re-fetch the DFA for a LACON subre node
396 : *
397 : * Same as above, but for LACONs.
398 : */
399 : static struct dfa *
400 CBC 120 : getladfa(struct vars *v,
401 : int n)
402 : {
403 GIC 120 : assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
404 :
405 120 : if (v->ladfas[n] == NULL)
406 : {
407 53 : struct subre *sub = &v->g->lacons[n];
408 :
409 CBC 53 : v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
410 : /* a LACON can't contain a backref, so nothing else to do */
411 : }
412 120 : return v->ladfas[n];
413 : }
414 ECB :
415 : /*
416 : * find - find a match for the main NFA (no-complications case)
417 : */
418 : static int
419 GIC 901033 : find(struct vars *v,
420 : struct cnfa *cnfa,
421 ECB : struct colormap *cm)
422 : {
423 : struct dfa *s;
424 : struct dfa *d;
425 : chr *begin;
426 GIC 901033 : chr *end = NULL;
427 : chr *cold;
428 ECB : chr *open; /* open and close of range of possible starts */
429 : chr *close;
430 : int hitend;
431 GIC 901033 : int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
432 :
433 : /* first, a shot with the search RE */
434 901033 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
435 CBC 901033 : if (s == NULL)
436 UIC 0 : return v->err;
437 : MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
438 GIC 901033 : cold = NULL;
439 901033 : close = shortest(v, s, v->search_start, v->search_start, v->stop,
440 ECB : &cold, (int *) NULL);
441 GIC 901033 : freedfa(s);
442 901033 : NOERR();
443 CBC 901033 : if (v->g->cflags & REG_EXPECT)
444 ECB : {
445 GBC 27 : assert(v->details != NULL);
446 GIC 27 : if (cold != NULL)
447 CBC 27 : v->details->rm_extend.rm_so = OFF(cold);
448 ECB : else
449 UIC 0 : v->details->rm_extend.rm_so = OFF(v->stop);
450 CBC 27 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
451 ECB : }
452 CBC 901033 : if (close == NULL) /* not found */
453 GIC 330758 : return REG_NOMATCH;
454 CBC 570275 : if (v->nmatch == 0) /* found, don't need exact location */
455 118944 : return REG_OKAY;
456 ECB :
457 : /* find starting point and match */
458 GBC 451331 : assert(cold != NULL);
459 CBC 451331 : open = cold;
460 GIC 451331 : cold = NULL;
461 ECB : MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
462 CBC 451331 : d = newdfa(v, cnfa, cm, &v->dfa1);
463 451331 : if (d == NULL)
464 LBC 0 : return v->err;
465 GIC 451360 : for (begin = open; begin <= close; begin++)
466 : {
467 ECB : MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
468 CBC 451360 : if (shorter)
469 61 : end = shortest(v, d, begin, begin, v->stop,
470 : (chr **) NULL, &hitend);
471 ECB : else
472 CBC 451299 : end = longest(v, d, begin, v->stop, &hitend);
473 GBC 451360 : if (ISERR())
474 ECB : {
475 UIC 0 : freedfa(d);
476 0 : return v->err;
477 ECB : }
478 CBC 451360 : if (hitend && cold == NULL)
479 GIC 3061 : cold = begin;
480 451360 : if (end != NULL)
481 CBC 451331 : break; /* NOTE BREAK OUT */
482 ECB : }
483 GIC 451331 : assert(end != NULL); /* search RE succeeded so loop should */
484 GBC 451331 : freedfa(d);
485 EUB :
486 : /* and pin down details */
487 CBC 451331 : assert(v->nmatch > 0);
488 451331 : v->pmatch[0].rm_so = OFF(begin);
489 451331 : v->pmatch[0].rm_eo = OFF(end);
490 451331 : if (v->g->cflags & REG_EXPECT)
491 : {
492 10 : if (cold != NULL)
493 5 : v->details->rm_extend.rm_so = OFF(cold);
494 : else
495 GIC 5 : v->details->rm_extend.rm_so = OFF(v->stop);
496 CBC 10 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
497 ECB : }
498 CBC 451331 : if (v->nmatch == 1) /* no need for submatches */
499 449125 : return REG_OKAY;
500 :
501 ECB : /* find submatches */
502 CBC 2206 : return cdissect(v, v->g->tree, begin, end);
503 : }
504 ECB :
505 : /*
506 : * cfind - find a match for the main NFA (with complications)
507 : */
508 : static int
509 GIC 145 : cfind(struct vars *v,
510 : struct cnfa *cnfa,
511 ECB : struct colormap *cm)
512 : {
513 : struct dfa *s;
514 : struct dfa *d;
515 : chr *cold;
516 : int ret;
517 :
518 CBC 145 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
519 GIC 145 : if (s == NULL)
520 UIC 0 : return v->err;
521 GIC 145 : d = newdfa(v, cnfa, cm, &v->dfa2);
522 145 : if (d == NULL)
523 : {
524 UIC 0 : freedfa(s);
525 0 : return v->err;
526 : }
527 ECB :
528 CBC 145 : ret = cfindloop(v, cnfa, cm, d, s, &cold);
529 EUB :
530 CBC 145 : freedfa(d);
531 145 : freedfa(s);
532 GIC 145 : NOERR();
533 GBC 145 : if (v->g->cflags & REG_EXPECT)
534 EUB : {
535 UIC 0 : assert(v->details != NULL);
536 0 : if (cold != NULL)
537 LBC 0 : v->details->rm_extend.rm_so = OFF(cold);
538 : else
539 0 : v->details->rm_extend.rm_so = OFF(v->stop);
540 0 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
541 ECB : }
542 CBC 145 : return ret;
543 : }
544 EUB :
545 : /*
546 : * cfindloop - the heart of cfind
547 : */
548 : static int
549 GBC 145 : cfindloop(struct vars *v,
550 : struct cnfa *cnfa,
551 ECB : struct colormap *cm,
552 : struct dfa *d,
553 : struct dfa *s,
554 : chr **coldp) /* where to put coldstart pointer */
555 : {
556 : chr *begin;
557 : chr *end;
558 : chr *cold;
559 : chr *open; /* open and close of range of possible starts */
560 : chr *close;
561 : chr *estart;
562 : chr *estop;
563 : int er;
564 GIC 145 : int shorter = v->g->tree->flags & SHORTER;
565 : int hitend;
566 :
567 145 : assert(d != NULL && s != NULL);
568 145 : cold = NULL;
569 145 : close = v->search_start;
570 : do
571 : {
572 : /* Search with the search RE for match range at/beyond "close" */
573 ECB : MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
574 GIC 161 : close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
575 161 : if (ISERR())
576 ECB : {
577 LBC 0 : *coldp = cold;
578 0 : return v->err;
579 : }
580 GIC 161 : if (close == NULL)
581 15 : break; /* no more possible match anywhere */
582 146 : assert(cold != NULL);
583 CBC 146 : open = cold;
584 146 : cold = NULL;
585 : /* Search for matches starting between "open" and "close" inclusive */
586 EUB : MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
587 GBC 513 : for (begin = open; begin <= close; begin++)
588 : {
589 ECB : MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
590 CBC 452 : estart = begin;
591 452 : estop = v->stop;
592 ECB : for (;;)
593 : {
594 : /* Here we use the top node's detailed RE */
595 GIC 646 : if (shorter)
596 CBC 97 : end = shortest(v, d, begin, estart,
597 : estop, (chr **) NULL, &hitend);
598 : else
599 549 : end = longest(v, d, begin, estop,
600 ECB : &hitend);
601 GIC 646 : if (ISERR())
602 : {
603 UIC 0 : *coldp = cold;
604 LBC 0 : return v->err;
605 ECB : }
606 GIC 646 : if (hitend && cold == NULL)
607 104 : cold = begin;
608 CBC 646 : if (end == NULL)
609 GIC 349 : break; /* no match with this begin point, try next */
610 ECB : MDEBUG(("tentative end %ld\n", LOFF(end)));
611 : /* Dissect the potential match to see if it really matches */
612 GBC 297 : er = cdissect(v, v->g->tree, begin, end);
613 297 : if (er == REG_OKAY)
614 : {
615 CBC 85 : if (v->nmatch > 0)
616 ECB : {
617 CBC 85 : v->pmatch[0].rm_so = OFF(begin);
618 85 : v->pmatch[0].rm_eo = OFF(end);
619 : }
620 GIC 85 : *coldp = cold;
621 CBC 85 : return REG_OKAY;
622 ECB : }
623 GIC 212 : if (er != REG_NOMATCH)
624 ECB : {
625 UIC 0 : ERR(er);
626 LBC 0 : *coldp = cold;
627 0 : return er;
628 : }
629 ECB : /* Try next longer/shorter match with same begin point */
630 CBC 212 : if (shorter)
631 : {
632 81 : if (end == estop)
633 GIC 12 : break; /* no more, so try next begin point */
634 GBC 69 : estart = end + 1;
635 EUB : }
636 : else
637 : {
638 GIC 131 : if (end == begin)
639 CBC 6 : break; /* no more, so try next begin point */
640 GIC 125 : estop = end - 1;
641 ECB : }
642 : } /* end loop over endpoint positions */
643 : } /* end loop over beginning positions */
644 :
645 : /*
646 : * If we get here, there is no possible match starting at or before
647 : * "close", so consider matches beyond that. We'll do a fresh search
648 : * with the search RE to find a new promising match range.
649 : */
650 GIC 61 : close++;
651 61 : } while (close < v->stop);
652 :
653 60 : *coldp = cold;
654 60 : return REG_NOMATCH;
655 : }
656 :
657 : /*
658 : * zapallsubs - initialize all subexpression matches to "no match"
659 ECB : *
660 : * Note that p[0], the overall-match location, is not touched.
661 : */
662 : static void
663 CBC 1004872 : zapallsubs(regmatch_t *p,
664 : size_t n)
665 : {
666 : size_t i;
667 :
668 GIC 1012596 : for (i = n - 1; i > 0; i--)
669 : {
670 7724 : p[i].rm_so = -1;
671 7724 : p[i].rm_eo = -1;
672 ECB : }
673 GIC 1004872 : }
674 :
675 : /*
676 : * zaptreesubs - initialize subexpressions within subtree to "no match"
677 ECB : */
678 : static void
679 CBC 520 : zaptreesubs(struct vars *v,
680 ECB : struct subre *t)
681 : {
682 CBC 520 : int n = t->capno;
683 : struct subre *t2;
684 :
685 GIC 520 : if (n > 0)
686 : {
687 243 : if ((size_t) n < v->nmatch)
688 ECB : {
689 GIC 243 : v->pmatch[n].rm_so = -1;
690 243 : v->pmatch[n].rm_eo = -1;
691 ECB : }
692 : }
693 :
694 CBC 695 : for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
695 GIC 175 : zaptreesubs(v, t2);
696 CBC 520 : }
697 :
698 ECB : /*
699 : * subset - set subexpression match data for a successful subre
700 : */
701 : static void
702 GIC 4059 : subset(struct vars *v,
703 ECB : struct subre *sub,
704 : chr *begin,
705 : chr *end)
706 : {
707 GIC 4059 : int n = sub->capno;
708 :
709 4059 : assert(n > 0);
710 4059 : if ((size_t) n >= v->nmatch)
711 CBC 9 : return;
712 :
713 : MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, n, LOFF(begin), LOFF(end)));
714 GIC 4050 : v->pmatch[n].rm_so = OFF(begin);
715 4050 : v->pmatch[n].rm_eo = OFF(end);
716 ECB : }
717 :
718 : /*
719 : * cdissect - check backrefs and determine subexpression matches
720 : *
721 : * cdissect recursively processes a subre tree to check matching of backrefs
722 : * and/or identify submatch boundaries for capture nodes. The proposed match
723 : * runs from "begin" to "end" (not including "end"), and we are basically
724 : * "dissecting" it to see where the submatches are.
725 : *
726 : * Before calling any level of cdissect, the caller must have run the node's
727 : * DFA and found that the proposed substring satisfies the DFA. (We make
728 : * the caller do that because in concatenation and iteration nodes, it's
729 : * much faster to check all the substrings against the child DFAs before we
730 : * recurse.)
731 : *
732 : * A side-effect of a successful match is to save match locations for
733 : * capturing subexpressions in v->pmatch[]. This is a little bit tricky,
734 : * so we make the following rules:
735 : * 1. Before initial entry to cdissect, all match data must have been
736 : * cleared (this is seen to by zapallsubs).
737 : * 2. Before any recursive entry to cdissect, the match data for that
738 : * subexpression tree must be guaranteed clear (see zaptreesubs).
739 : * 3. When returning REG_OKAY, each level of cdissect will have saved
740 : * any relevant match locations.
741 : * 4. When returning REG_NOMATCH, each level of cdissect will guarantee
742 : * that its subexpression match locations are again clear.
743 : * 5. No guarantees are made for error cases (i.e., other result codes).
744 : * 6. When a level of cdissect abandons a successful sub-match, it will
745 : * clear that subtree's match locations with zaptreesubs before trying
746 : * any new DFA match or cdissect call for that subtree or any subtree
747 : * to its right (that is, any subtree that could have a backref into the
748 : * abandoned match).
749 : * This may seem overly complicated, but it's difficult to simplify it
750 : * because of the provision that match locations must be reset before
751 : * any fresh DFA match (a rule that is needed to make dfa_backref safe).
752 : * That means it won't work to just reset relevant match locations at the
753 : * start of each cdissect level.
754 : */
755 : static int /* regexec return code */
756 GIC 15248 : cdissect(struct vars *v,
757 : struct subre *t,
758 : chr *begin, /* beginning of relevant substring */
759 : chr *end) /* end of same */
760 : {
761 : int er;
762 :
763 15248 : assert(t != NULL);
764 : MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));
765 ECB :
766 : /* handy place to check for operation cancel */
767 GNC 15248 : INTERRUPT(v->re);
768 : /* ... and stack overrun */
769 GIC 15248 : if (STACK_TOO_DEEP(v->re))
770 UIC 0 : return REG_ETOOBIG;
771 ECB :
772 GIC 15248 : switch (t->op)
773 : {
774 8407 : case '=': /* terminal node */
775 CBC 8407 : assert(t->child == NULL);
776 GIC 8407 : er = REG_OKAY; /* no action, parent did the work */
777 CBC 8407 : break;
778 GBC 209 : case 'b': /* back reference */
779 GIC 209 : assert(t->child == NULL);
780 CBC 209 : er = cbrdissect(v, t, begin, end);
781 GIC 209 : break;
782 CBC 6422 : case '.': /* concatenation */
783 6422 : assert(t->child != NULL);
784 6422 : if (t->child->flags & SHORTER) /* reverse scan */
785 164 : er = crevcondissect(v, t, begin, end);
786 ECB : else
787 CBC 6258 : er = ccondissect(v, t, begin, end);
788 6422 : break;
789 68 : case '|': /* alternation */
790 68 : assert(t->child != NULL);
791 68 : er = caltdissect(v, t, begin, end);
792 68 : break;
793 112 : case '*': /* iteration */
794 GIC 112 : assert(t->child != NULL);
795 CBC 112 : if (t->child->flags & SHORTER) /* reverse scan */
796 7 : er = creviterdissect(v, t, begin, end);
797 ECB : else
798 CBC 105 : er = citerdissect(v, t, begin, end);
799 112 : break;
800 30 : case '(': /* no-op capture node */
801 30 : assert(t->child != NULL);
802 30 : er = cdissect(v, t->child, begin, end);
803 30 : break;
804 LBC 0 : default:
805 UIC 0 : er = REG_ASSERT;
806 LBC 0 : break;
807 ECB : }
808 :
809 : /*
810 : * We should never have a match failure unless backrefs lurk below;
811 : * otherwise, either caller failed to check the DFA, or there's some
812 EUB : * inconsistency between the DFA and the node's innards.
813 : */
814 GBC 15248 : assert(er != REG_NOMATCH || (t->flags & BACKR));
815 :
816 : /*
817 : * If this node is marked as capturing, save successful match's location.
818 : */
819 GIC 15248 : if (t->capno > 0 && er == REG_OKAY)
820 4059 : subset(v, t, begin, end);
821 :
822 CBC 15248 : return er;
823 : }
824 :
825 : /*
826 : * ccondissect - dissect match for concatenation node
827 ECB : */
828 : static int /* regexec return code */
829 GIC 6258 : ccondissect(struct vars *v,
830 ECB : struct subre *t,
831 : chr *begin, /* beginning of relevant substring */
832 : chr *end) /* end of same */
833 : {
834 GIC 6258 : struct subre *left = t->child;
835 6258 : struct subre *right = left->sibling;
836 : struct dfa *d;
837 ECB : struct dfa *d2;
838 : chr *mid;
839 : int er;
840 :
841 GIC 6258 : assert(t->op == '.');
842 CBC 6258 : assert(left != NULL && left->cnfa.nstates > 0);
843 6258 : assert(right != NULL && right->cnfa.nstates > 0);
844 GIC 6258 : assert(right->sibling == NULL);
845 6258 : assert(!(left->flags & SHORTER));
846 :
847 6258 : d = getsubdfa(v, left);
848 6258 : NOERR();
849 CBC 6258 : d2 = getsubdfa(v, right);
850 6258 : NOERR();
851 ECB : MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
852 :
853 : /* pick a tentative midpoint */
854 GIC 6258 : mid = longest(v, d, begin, end, (int *) NULL);
855 CBC 6258 : NOERR();
856 6258 : if (mid == NULL)
857 8 : return REG_NOMATCH;
858 ECB : MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
859 :
860 : /* iterate until satisfaction or failure */
861 : for (;;)
862 : {
863 : /* try this midpoint on for size */
864 CBC 7772 : if (longest(v, d2, mid, end, (int *) NULL) == end)
865 ECB : {
866 GIC 6209 : er = cdissect(v, left, begin, mid);
867 6209 : if (er == REG_OKAY)
868 : {
869 6159 : er = cdissect(v, right, mid, end);
870 6159 : if (er == REG_OKAY)
871 : {
872 ECB : /* satisfaction */
873 : MDEBUG(("%d: successful\n", t->id));
874 CBC 5911 : return REG_OKAY;
875 ECB : }
876 : /* Reset left's matches (right should have done so itself) */
877 CBC 248 : zaptreesubs(v, left);
878 ECB : }
879 GIC 298 : if (er != REG_NOMATCH)
880 UIC 0 : return er;
881 : }
882 CBC 1861 : NOERR();
883 :
884 : /* that midpoint didn't work, find a new one */
885 1861 : if (mid == begin)
886 : {
887 ECB : /* all possibilities exhausted */
888 EUB : MDEBUG(("%d: no midpoint\n", t->id));
889 GIC 75 : return REG_NOMATCH;
890 ECB : }
891 GIC 1786 : mid = longest(v, d, begin, mid - 1, (int *) NULL);
892 1786 : NOERR();
893 CBC 1786 : if (mid == NULL)
894 : {
895 : /* failed to find a new one */
896 : MDEBUG(("%d: failed midpoint\n", t->id));
897 264 : return REG_NOMATCH;
898 : }
899 ECB : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
900 : }
901 :
902 : /* can't get here */
903 : return REG_ASSERT;
904 : }
905 :
906 : /*
907 : * crevcondissect - dissect match for concatenation node, shortest-first
908 : */
909 : static int /* regexec return code */
910 GIC 164 : crevcondissect(struct vars *v,
911 : struct subre *t,
912 : chr *begin, /* beginning of relevant substring */
913 : chr *end) /* end of same */
914 : {
915 164 : struct subre *left = t->child;
916 164 : struct subre *right = left->sibling;
917 : struct dfa *d;
918 ECB : struct dfa *d2;
919 : chr *mid;
920 : int er;
921 :
922 GIC 164 : assert(t->op == '.');
923 CBC 164 : assert(left != NULL && left->cnfa.nstates > 0);
924 164 : assert(right != NULL && right->cnfa.nstates > 0);
925 GIC 164 : assert(right->sibling == NULL);
926 164 : assert(left->flags & SHORTER);
927 :
928 164 : d = getsubdfa(v, left);
929 164 : NOERR();
930 CBC 164 : d2 = getsubdfa(v, right);
931 164 : NOERR();
932 ECB : MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
933 :
934 : /* pick a tentative midpoint */
935 GIC 164 : mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
936 CBC 164 : NOERR();
937 164 : if (mid == NULL)
938 LBC 0 : return REG_NOMATCH;
939 ECB : MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
940 :
941 : /* iterate until satisfaction or failure */
942 : for (;;)
943 : {
944 : /* try this midpoint on for size */
945 CBC 605 : if (longest(v, d2, mid, end, (int *) NULL) == end)
946 EUB : {
947 GIC 93 : er = cdissect(v, left, begin, mid);
948 93 : if (er == REG_OKAY)
949 : {
950 93 : er = cdissect(v, right, mid, end);
951 93 : if (er == REG_OKAY)
952 : {
953 ECB : /* satisfaction */
954 : MDEBUG(("%d: successful\n", t->id));
955 CBC 81 : return REG_OKAY;
956 ECB : }
957 : /* Reset left's matches (right should have done so itself) */
958 CBC 12 : zaptreesubs(v, left);
959 ECB : }
960 GIC 12 : if (er != REG_NOMATCH)
961 UIC 0 : return er;
962 : }
963 CBC 524 : NOERR();
964 :
965 : /* that midpoint didn't work, find a new one */
966 524 : if (mid == end)
967 : {
968 ECB : /* all possibilities exhausted */
969 EUB : MDEBUG(("%d: no midpoint\n", t->id));
970 GIC 83 : return REG_NOMATCH;
971 ECB : }
972 GIC 441 : mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
973 441 : NOERR();
974 CBC 441 : if (mid == NULL)
975 : {
976 : /* failed to find a new one */
977 : MDEBUG(("%d: failed midpoint\n", t->id));
978 LBC 0 : return REG_NOMATCH;
979 : }
980 ECB : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
981 : }
982 :
983 : /* can't get here */
984 : return REG_ASSERT;
985 : }
986 EUB :
987 : /*
988 : * cbrdissect - dissect match for backref node
989 : *
990 : * The backref match might already have been verified by dfa_backref(),
991 : * but we don't know that for sure so must check it here.
992 : */
993 : static int /* regexec return code */
994 GIC 209 : cbrdissect(struct vars *v,
995 : struct subre *t,
996 : chr *begin, /* beginning of relevant substring */
997 : chr *end) /* end of same */
998 : {
999 209 : int n = t->backno;
1000 : size_t numreps;
1001 : size_t tlen;
1002 ECB : size_t brlen;
1003 : chr *brstring;
1004 : chr *p;
1005 GIC 209 : int min = t->min;
1006 209 : int max = t->max;
1007 ECB :
1008 GIC 209 : assert(t != NULL);
1009 209 : assert(t->op == 'b');
1010 209 : assert(n >= 0);
1011 209 : assert((size_t) n < v->nmatch);
1012 :
1013 ECB : MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, n, min, max,
1014 : LOFF(begin), LOFF(end)));
1015 :
1016 : /* get the backreferenced string */
1017 CBC 209 : if (v->pmatch[n].rm_so == -1)
1018 55 : return REG_NOMATCH;
1019 154 : brstring = v->start + v->pmatch[n].rm_so;
1020 GIC 154 : brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
1021 :
1022 : /* special cases for zero-length strings */
1023 154 : if (brlen == 0)
1024 : {
1025 ECB : /*
1026 : * matches only if target is zero length, but any number of
1027 : * repetitions can be considered to be present
1028 : */
1029 GIC 7 : if (begin == end && min <= max)
1030 : {
1031 ECB : MDEBUG(("%d: backref matched trivially\n", t->id));
1032 GIC 7 : return REG_OKAY;
1033 : }
1034 UIC 0 : return REG_NOMATCH;
1035 : }
1036 GIC 147 : if (begin == end)
1037 ECB : {
1038 : /* matches only if zero repetitions are okay */
1039 GIC 6 : if (min == 0)
1040 ECB : {
1041 : MDEBUG(("%d: backref matched trivially\n", t->id));
1042 GBC 5 : return REG_OKAY;
1043 : }
1044 CBC 1 : return REG_NOMATCH;
1045 : }
1046 :
1047 ECB : /*
1048 : * check target length to see if it could possibly be an allowed number of
1049 : * repetitions of brstring
1050 : */
1051 GIC 141 : assert(end > begin);
1052 CBC 141 : tlen = end - begin;
1053 GIC 141 : if (tlen % brlen != 0)
1054 5 : return REG_NOMATCH;
1055 136 : numreps = tlen / brlen;
1056 136 : if (numreps < min || (numreps > max && max != DUPINF))
1057 3 : return REG_NOMATCH;
1058 :
1059 ECB : /* okay, compare the actual string contents */
1060 CBC 133 : p = begin;
1061 236 : while (numreps-- > 0)
1062 ECB : {
1063 CBC 151 : if ((*v->g->compare) (brstring, p, brlen) != 0)
1064 48 : return REG_NOMATCH;
1065 103 : p += brlen;
1066 : }
1067 :
1068 ECB : MDEBUG(("%d: backref matched\n", t->id));
1069 CBC 85 : return REG_OKAY;
1070 : }
1071 ECB :
1072 : /*
1073 : * caltdissect - dissect match for alternation node
1074 : */
1075 : static int /* regexec return code */
1076 GIC 68 : caltdissect(struct vars *v,
1077 ECB : struct subre *t,
1078 : chr *begin, /* beginning of relevant substring */
1079 : chr *end) /* end of same */
1080 : {
1081 : struct dfa *d;
1082 : int er;
1083 :
1084 CBC 68 : assert(t->op == '|');
1085 :
1086 GIC 68 : t = t->child;
1087 : /* there should be at least 2 alternatives */
1088 68 : assert(t != NULL && t->sibling != NULL);
1089 :
1090 125 : while (t != NULL)
1091 : {
1092 CBC 99 : assert(t->cnfa.nstates > 0);
1093 :
1094 ECB : MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1095 :
1096 CBC 99 : d = getsubdfa(v, t);
1097 GIC 99 : NOERR();
1098 CBC 99 : if (longest(v, d, begin, end, (int *) NULL) == end)
1099 : {
1100 ECB : MDEBUG(("%d: caltdissect matched\n", t->id));
1101 GIC 76 : er = cdissect(v, t, begin, end);
1102 76 : if (er != REG_NOMATCH)
1103 42 : return er;
1104 ECB : }
1105 CBC 57 : NOERR();
1106 ECB :
1107 GIC 57 : t = t->sibling;
1108 : }
1109 ECB :
1110 CBC 26 : return REG_NOMATCH;
1111 ECB : }
1112 :
1113 : /*
1114 : * citerdissect - dissect match for iteration node
1115 : */
1116 : static int /* regexec return code */
1117 GIC 105 : citerdissect(struct vars *v,
1118 ECB : struct subre *t,
1119 : chr *begin, /* beginning of relevant substring */
1120 : chr *end) /* end of same */
1121 : {
1122 : struct dfa *d;
1123 : chr **endpts;
1124 : chr *limit;
1125 : int min_matches;
1126 : size_t max_matches;
1127 : int nverified;
1128 : int k;
1129 : int i;
1130 : int er;
1131 :
1132 GIC 105 : assert(t->op == '*');
1133 105 : assert(t->child != NULL && t->child->cnfa.nstates > 0);
1134 105 : assert(!(t->child->flags & SHORTER));
1135 105 : assert(begin <= end);
1136 :
1137 : MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1138 :
1139 : /*
1140 ECB : * For the moment, assume the minimum number of matches is 1. If zero
1141 : * matches are allowed, and the target string is empty, we are allowed to
1142 : * match regardless of the contents of the iter node --- but we would
1143 : * prefer to match once, so that capturing parens get set. (An example of
1144 : * the concern here is a pattern like "()*\1", which historically this
1145 : * code has allowed to succeed.) Therefore, we deal with the zero-matches
1146 : * case at the bottom, after failing to find any other way to match.
1147 : */
1148 GIC 105 : min_matches = t->min;
1149 105 : if (min_matches <= 0)
1150 69 : min_matches = 1;
1151 :
1152 : /*
1153 : * We need workspace to track the endpoints of each sub-match. Normally
1154 : * we consider only nonzero-length sub-matches, so there can be at most
1155 : * end-begin of them. However, if min is larger than that, we will also
1156 ECB : * consider zero-length sub-matches in order to find enough matches.
1157 : *
1158 : * For convenience, endpts[0] contains the "begin" pointer and we store
1159 : * sub-match endpoints in endpts[1..max_matches].
1160 : */
1161 GIC 105 : max_matches = end - begin;
1162 105 : if (max_matches > t->max && t->max != DUPINF)
1163 UIC 0 : max_matches = t->max;
1164 GIC 105 : if (max_matches < min_matches)
1165 61 : max_matches = min_matches;
1166 105 : endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1167 105 : if (endpts == NULL)
1168 UIC 0 : return REG_ESPACE;
1169 CBC 105 : endpts[0] = begin;
1170 ECB :
1171 GBC 105 : d = getsubdfa(v, t->child);
1172 CBC 105 : if (ISERR())
1173 ECB : {
1174 LBC 0 : FREE(endpts);
1175 0 : return v->err;
1176 EUB : }
1177 ECB :
1178 : /*
1179 : * Our strategy is to first find a set of sub-match endpoints that are
1180 : * valid according to the child node's DFA, and then recursively dissect
1181 : * each sub-match to confirm validity. If any validity check fails,
1182 EUB : * backtrack that sub-match and try again. And, when we next try for a
1183 : * validity check, we need not recheck any successfully verified
1184 : * sub-matches that we didn't move the endpoints of. nverified remembers
1185 : * how many sub-matches are currently known okay.
1186 : */
1187 :
1188 : /* initialize to consider first sub-match */
1189 GIC 105 : nverified = 0;
1190 105 : k = 1;
1191 105 : limit = end;
1192 :
1193 : /* iterate until satisfaction or failure */
1194 479 : while (k > 0)
1195 : {
1196 : /* try to find an endpoint for the k'th sub-match */
1197 CBC 395 : endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
1198 395 : if (ISERR())
1199 ECB : {
1200 UIC 0 : FREE(endpts);
1201 0 : return v->err;
1202 ECB : }
1203 GIC 395 : if (endpts[k] == NULL)
1204 : {
1205 ECB : /* no match possible, so see if we can shorten previous one */
1206 CBC 200 : k--;
1207 GIC 200 : goto backtrack;
1208 EUB : }
1209 : MDEBUG(("%d: working endpoint %d: %ld\n",
1210 : t->id, k, LOFF(endpts[k])));
1211 ECB :
1212 : /* k'th sub-match can no longer be considered verified */
1213 GIC 195 : if (nverified >= k)
1214 CBC 8 : nverified = k - 1;
1215 ECB :
1216 GIC 195 : if (endpts[k] != end)
1217 : {
1218 : /* haven't reached end yet, try another iteration if allowed */
1219 134 : if (k >= max_matches)
1220 : {
1221 ECB : /* must try to shorten some previous match */
1222 LBC 0 : k--;
1223 UIC 0 : goto backtrack;
1224 ECB : }
1225 :
1226 : /* reject zero-length match unless necessary to achieve min */
1227 CBC 134 : if (endpts[k] == endpts[k - 1] &&
1228 UIC 0 : (k >= min_matches || min_matches - k < end - endpts[k]))
1229 0 : goto backtrack;
1230 EUB :
1231 GBC 134 : k++;
1232 GIC 134 : limit = end;
1233 134 : continue;
1234 : }
1235 ECB :
1236 EUB : /*
1237 : * We've identified a way to divide the string into k sub-matches that
1238 : * works so far as the child DFA can tell. If k is an allowed number
1239 ECB : * of matches, start the slow part: recurse to verify each sub-match.
1240 : * We always have k <= max_matches, needn't check that.
1241 : */
1242 GIC 61 : if (k < min_matches)
1243 UIC 0 : goto backtrack;
1244 :
1245 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1246 :
1247 GIC 100 : for (i = nverified + 1; i <= k; i++)
1248 : {
1249 : /* zap any match data from a non-last iteration */
1250 CBC 79 : zaptreesubs(v, t->child);
1251 GBC 79 : er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
1252 GIC 79 : if (er == REG_OKAY)
1253 : {
1254 39 : nverified = i;
1255 CBC 39 : continue;
1256 : }
1257 GIC 40 : if (er == REG_NOMATCH)
1258 CBC 40 : break;
1259 ECB : /* oops, something failed */
1260 LBC 0 : FREE(endpts);
1261 UIC 0 : return er;
1262 ECB : }
1263 :
1264 GIC 61 : if (i > k)
1265 ECB : {
1266 : /* satisfaction */
1267 : MDEBUG(("%d: successful\n", t->id));
1268 GBC 21 : FREE(endpts);
1269 21 : return REG_OKAY;
1270 : }
1271 :
1272 ECB : /* i'th match failed to verify, so backtrack it */
1273 GIC 40 : k = i;
1274 :
1275 240 : backtrack:
1276 ECB :
1277 : /*
1278 : * Must consider shorter versions of the k'th sub-match. However,
1279 : * we'll only ask for a zero-length match if necessary.
1280 : */
1281 CBC 240 : while (k > 0)
1282 : {
1283 156 : chr *prev_end = endpts[k - 1];
1284 :
1285 GIC 156 : if (endpts[k] > prev_end)
1286 : {
1287 156 : limit = endpts[k] - 1;
1288 156 : if (limit > prev_end ||
1289 LBC 0 : (k < min_matches && min_matches - k >= end - prev_end))
1290 : {
1291 ECB : /* break out of backtrack loop, continue the outer one */
1292 : break;
1293 : }
1294 : }
1295 : /* can't shorten k'th sub-match any more, consider previous one */
1296 LBC 0 : k--;
1297 EUB : }
1298 : }
1299 :
1300 : /* all possibilities exhausted */
1301 GIC 84 : FREE(endpts);
1302 :
1303 : /*
1304 EUB : * Now consider the possibility that we can match to a zero-length string
1305 : * by using zero repetitions.
1306 : */
1307 GIC 84 : if (t->min == 0 && begin == end)
1308 : {
1309 ECB : MDEBUG(("%d: allowing zero matches\n", t->id));
1310 GIC 56 : return REG_OKAY;
1311 : }
1312 :
1313 : MDEBUG(("%d: failed\n", t->id));
1314 28 : return REG_NOMATCH;
1315 ECB : }
1316 :
1317 : /*
1318 : * creviterdissect - dissect match for iteration node, shortest-first
1319 : */
1320 : static int /* regexec return code */
1321 GIC 7 : creviterdissect(struct vars *v,
1322 ECB : struct subre *t,
1323 : chr *begin, /* beginning of relevant substring */
1324 : chr *end) /* end of same */
1325 : {
1326 : struct dfa *d;
1327 : chr **endpts;
1328 : chr *limit;
1329 : int min_matches;
1330 : size_t max_matches;
1331 : int nverified;
1332 : int k;
1333 : int i;
1334 : int er;
1335 :
1336 GIC 7 : assert(t->op == '*');
1337 7 : assert(t->child != NULL && t->child->cnfa.nstates > 0);
1338 7 : assert(t->child->flags & SHORTER);
1339 7 : assert(begin <= end);
1340 :
1341 : MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1342 :
1343 : /*
1344 ECB : * If zero matches are allowed, and target string is empty, just declare
1345 : * victory. OTOH, if target string isn't empty, zero matches can't work
1346 : * so we pretend the min is 1.
1347 : */
1348 GIC 7 : min_matches = t->min;
1349 7 : if (min_matches <= 0)
1350 : {
1351 7 : if (begin == end)
1352 : {
1353 : MDEBUG(("%d: allowing zero matches\n", t->id));
1354 1 : return REG_OKAY;
1355 : }
1356 CBC 6 : min_matches = 1;
1357 ECB : }
1358 :
1359 : /*
1360 : * We need workspace to track the endpoints of each sub-match. Normally
1361 : * we consider only nonzero-length sub-matches, so there can be at most
1362 : * end-begin of them. However, if min is larger than that, we will also
1363 : * consider zero-length sub-matches in order to find enough matches.
1364 : *
1365 : * For convenience, endpts[0] contains the "begin" pointer and we store
1366 : * sub-match endpoints in endpts[1..max_matches].
1367 : */
1368 GIC 6 : max_matches = end - begin;
1369 6 : if (max_matches > t->max && t->max != DUPINF)
1370 6 : max_matches = t->max;
1371 6 : if (max_matches < min_matches)
1372 UIC 0 : max_matches = min_matches;
1373 GIC 6 : endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1374 6 : if (endpts == NULL)
1375 UIC 0 : return REG_ESPACE;
1376 CBC 6 : endpts[0] = begin;
1377 ECB :
1378 CBC 6 : d = getsubdfa(v, t->child);
1379 6 : if (ISERR())
1380 EUB : {
1381 LBC 0 : FREE(endpts);
1382 0 : return v->err;
1383 EUB : }
1384 ECB :
1385 : /*
1386 : * Our strategy is to first find a set of sub-match endpoints that are
1387 : * valid according to the child node's DFA, and then recursively dissect
1388 : * each sub-match to confirm validity. If any validity check fails,
1389 EUB : * backtrack that sub-match and try again. And, when we next try for a
1390 : * validity check, we need not recheck any successfully verified
1391 : * sub-matches that we didn't move the endpoints of. nverified remembers
1392 : * how many sub-matches are currently known okay.
1393 : */
1394 :
1395 : /* initialize to consider first sub-match */
1396 GIC 6 : nverified = 0;
1397 6 : k = 1;
1398 6 : limit = begin;
1399 :
1400 : /* iterate until satisfaction or failure */
1401 8 : while (k > 0)
1402 : {
1403 : /* disallow zero-length match unless necessary to achieve min */
1404 CBC 6 : if (limit == endpts[k - 1] &&
1405 6 : limit != end &&
1406 LBC 0 : (k >= min_matches || min_matches - k < end - limit))
1407 GIC 6 : limit++;
1408 :
1409 ECB : /* if this is the last allowed sub-match, it must reach to the end */
1410 GIC 6 : if (k >= max_matches)
1411 6 : limit = end;
1412 ECB :
1413 : /* try to find an endpoint for the k'th sub-match */
1414 GBC 6 : endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
1415 ECB : (chr **) NULL, (int *) NULL);
1416 GIC 6 : if (ISERR())
1417 : {
1418 LBC 0 : FREE(endpts);
1419 0 : return v->err;
1420 : }
1421 GIC 6 : if (endpts[k] == NULL)
1422 ECB : {
1423 : /* no match possible, so see if we can lengthen previous one */
1424 LBC 0 : k--;
1425 UIC 0 : goto backtrack;
1426 EUB : }
1427 : MDEBUG(("%d: working endpoint %d: %ld\n",
1428 : t->id, k, LOFF(endpts[k])));
1429 ECB :
1430 : /* k'th sub-match can no longer be considered verified */
1431 GIC 6 : if (nverified >= k)
1432 UBC 0 : nverified = k - 1;
1433 EUB :
1434 GIC 6 : if (endpts[k] != end)
1435 : {
1436 : /* haven't reached end yet, try another iteration if allowed */
1437 UIC 0 : if (k >= max_matches)
1438 : {
1439 ECB : /* must try to lengthen some previous match */
1440 UBC 0 : k--;
1441 UIC 0 : goto backtrack;
1442 ECB : }
1443 :
1444 UIC 0 : k++;
1445 UBC 0 : limit = endpts[k - 1];
1446 UIC 0 : continue;
1447 : }
1448 EUB :
1449 : /*
1450 : * We've identified a way to divide the string into k sub-matches that
1451 : * works so far as the child DFA can tell. If k is an allowed number
1452 : * of matches, start the slow part: recurse to verify each sub-match.
1453 : * We always have k <= max_matches, needn't check that.
1454 : */
1455 GIC 6 : if (k < min_matches)
1456 UIC 0 : goto backtrack;
1457 :
1458 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1459 :
1460 GIC 10 : for (i = nverified + 1; i <= k; i++)
1461 : {
1462 : /* zap any match data from a non-last iteration */
1463 CBC 6 : zaptreesubs(v, t->child);
1464 GBC 6 : er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
1465 GIC 6 : if (er == REG_OKAY)
1466 : {
1467 4 : nverified = i;
1468 CBC 4 : continue;
1469 : }
1470 GIC 2 : if (er == REG_NOMATCH)
1471 CBC 2 : break;
1472 ECB : /* oops, something failed */
1473 LBC 0 : FREE(endpts);
1474 UIC 0 : return er;
1475 ECB : }
1476 :
1477 GIC 6 : if (i > k)
1478 ECB : {
1479 : /* satisfaction */
1480 : MDEBUG(("%d: successful\n", t->id));
1481 GBC 4 : FREE(endpts);
1482 4 : return REG_OKAY;
1483 : }
1484 :
1485 ECB : /* i'th match failed to verify, so backtrack it */
1486 GIC 2 : k = i;
1487 :
1488 2 : backtrack:
1489 ECB :
1490 : /*
1491 : * Must consider longer versions of the k'th sub-match.
1492 : */
1493 GIC 4 : while (k > 0)
1494 ECB : {
1495 GIC 2 : if (endpts[k] < end)
1496 ECB : {
1497 UIC 0 : limit = endpts[k] + 1;
1498 : /* break out of backtrack loop, continue the outer one */
1499 0 : break;
1500 : }
1501 ECB : /* can't lengthen k'th sub-match any more, consider previous one */
1502 GIC 2 : k--;
1503 ECB : }
1504 : }
1505 EUB :
1506 : /* all possibilities exhausted */
1507 : MDEBUG(("%d: failed\n", t->id));
1508 GIC 2 : FREE(endpts);
1509 2 : return REG_NOMATCH;
1510 ECB : }
1511 :
1512 :
1513 :
1514 : #include "rege_dfa.c"
|