Age Owner TLA Line data Source code
1 : /*
2 : * DFA routines
3 : * This file is #included by regexec.c.
4 : *
5 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 : *
7 : * Development of this software was funded, in part, by Cray Research Inc.,
8 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 : * Corporation, none of whom are responsible for the results. The author
10 : * thanks all of them.
11 : *
12 : * Redistribution and use in source and binary forms -- with or without
13 : * modification -- are permitted for any purpose, provided that
14 : * redistributions in source form retain this entire copyright notice and
15 : * indicate the origin and nature of any modifications.
16 : *
17 : * I'd appreciate being given credit for this package in the documentation
18 : * of software which uses it, but that is not a requirement.
19 : *
20 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 : *
31 : * src/backend/regex/rege_dfa.c
32 : *
33 : */
34 :
35 : /*
36 : * longest - longest-preferred matching engine
37 : *
38 : * On success, returns match endpoint address. Returns NULL on no match.
39 : * Internal errors also return NULL, with v->err set.
40 : */
41 : static chr *
2118 tgl 42 CBC 468763 : longest(struct vars *v,
43 : struct dfa *d,
44 : chr *start, /* where the match should start */
45 : chr *stop, /* match must end at or before here */
46 : int *hitstopp) /* record whether hit v->stop, if non-NULL */
47 : {
48 : chr *cp;
7188 bruce 49 468763 : chr *realstop = (stop == v->stop) ? stop : stop + 1;
50 : color co;
51 : struct sset *css;
52 : struct sset *ss;
53 : chr *post;
54 : int i;
7368 tgl 55 468763 : struct colormap *cm = d->cm;
56 :
57 : /* prevent "uninitialized variable" warnings */
2746 58 468763 : if (hitstopp != NULL)
59 451848 : *hitstopp = 0;
60 :
61 : /* if this is a backref to a known string, just match against that */
768 62 468763 : if (d->backno >= 0)
63 : {
64 793 : assert((size_t) d->backno < v->nmatch);
65 793 : if (v->pmatch[d->backno].rm_so >= 0)
66 : {
67 616 : cp = dfa_backref(v, d, start, start, stop, false);
68 616 : if (cp == v->stop && stop == v->stop && hitstopp != NULL)
768 tgl 69 UBC 0 : *hitstopp = 1;
768 tgl 70 CBC 616 : return cp;
71 : }
72 : }
73 :
74 : /* fast path for matchall NFAs */
778 75 468147 : if (d->cnfa->flags & MATCHALL)
76 : {
77 2495 : size_t nchr = stop - start;
78 2495 : size_t maxmatchall = d->cnfa->maxmatchall;
79 :
80 2495 : if (nchr < d->cnfa->minmatchall)
81 165 : return NULL;
82 2330 : if (maxmatchall == DUPINF)
83 : {
84 1488 : if (stop == v->stop && hitstopp != NULL)
85 5 : *hitstopp = 1;
86 : }
87 : else
88 : {
89 842 : if (stop == v->stop && nchr <= maxmatchall + 1 && hitstopp != NULL)
90 84 : *hitstopp = 1;
91 842 : if (nchr > maxmatchall)
92 572 : return start + maxmatchall;
93 : }
94 1758 : return stop;
95 : }
96 :
97 : /* initialize */
7368 98 465652 : css = initialize(v, d, start);
2746 99 465652 : if (css == NULL)
2746 tgl 100 UBC 0 : return NULL;
7368 tgl 101 CBC 465652 : cp = start;
102 :
103 : /* startup */
104 : FDEBUG(("+++ startup +++\n"));
7188 bruce 105 465652 : if (cp == v->start)
106 : {
107 1951 : co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
108 : FDEBUG(("color %ld\n", (long) co));
109 : }
110 : else
111 : {
7368 tgl 112 463701 : co = GETCOLOR(cm, *(cp - 1));
113 : FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
114 : }
115 465652 : css = miss(v, d, css, co, cp, start);
116 465652 : if (css == NULL)
117 216 : return NULL;
118 465436 : css->lastseen = cp;
119 :
120 : /*
121 : * This is the main text-scanning loop. It seems worth having two copies
122 : * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
123 : * builds, when you're not actively tracing.
124 : */
125 : #ifdef REG_DEBUG
126 : if (v->eflags & REG_FTRACE)
127 : {
128 : while (cp < realstop)
129 : {
130 : FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
131 : co = GETCOLOR(cm, *cp);
132 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
133 : ss = css->outs[co];
134 : if (ss == NULL)
135 : {
136 : ss = miss(v, d, css, co, cp + 1, start);
137 : if (ss == NULL)
138 : break; /* NOTE BREAK OUT */
139 : }
140 : cp++;
141 : ss->lastseen = cp;
142 : css = ss;
143 : }
144 : }
145 : else
146 : #endif
147 : {
7188 bruce 148 5922947 : while (cp < realstop)
149 : {
7368 tgl 150 5910485 : co = GETCOLOR(cm, *cp);
151 5910485 : ss = css->outs[co];
7188 bruce 152 5910485 : if (ss == NULL)
153 : {
154 1483789 : ss = miss(v, d, css, co, cp + 1, start);
7368 tgl 155 1483789 : if (ss == NULL)
7188 bruce 156 452974 : break; /* NOTE BREAK OUT */
157 : }
7368 tgl 158 5457511 : cp++;
159 5457511 : ss->lastseen = cp;
160 5457511 : css = ss;
161 : }
162 : }
163 :
2746 164 465436 : if (ISERR())
2746 tgl 165 UBC 0 : return NULL;
166 :
167 : /* shutdown */
168 : FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
7188 bruce 169 CBC 465436 : if (cp == v->stop && stop == v->stop)
170 : {
7368 tgl 171 6546 : if (hitstopp != NULL)
172 3091 : *hitstopp = 1;
7188 bruce 173 6546 : co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
174 : FDEBUG(("color %ld\n", (long) co));
7368 tgl 175 6546 : ss = miss(v, d, css, co, cp, start);
2746 176 6546 : if (ISERR())
2746 tgl 177 UBC 0 : return NULL;
178 : /* special case: match ended at eol? */
7188 bruce 179 CBC 6546 : if (ss != NULL && (ss->flags & POSTSTATE))
7368 tgl 180 3501 : return cp;
181 3045 : else if (ss != NULL)
7368 tgl 182 UBC 0 : ss->lastseen = cp; /* to be tidy */
183 : }
184 :
185 : /* find last match, if any */
7368 tgl 186 CBC 461935 : post = d->lastpost;
187 2393081 : for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
7188 bruce 188 1931146 : if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
189 7210 : (post == NULL || post < ss->lastseen))
7368 tgl 190 466755 : post = ss->lastseen;
7188 bruce 191 461935 : if (post != NULL) /* found one */
7368 tgl 192 459927 : return post - 1;
193 :
194 2008 : return NULL;
195 : }
196 :
197 : /*
198 : * shortest - shortest-preferred matching engine
199 : *
200 : * On success, returns match endpoint address. Returns NULL on no match.
201 : * Internal errors also return NULL, with v->err set.
202 : */
203 : static chr *
2118 204 902023 : shortest(struct vars *v,
205 : struct dfa *d,
206 : chr *start, /* where the match should start */
207 : chr *min, /* match must end at or after here */
208 : chr *max, /* match must end at or before here */
209 : chr **coldp, /* store coldstart pointer here, if non-NULL */
210 : int *hitstopp) /* record whether hit v->stop, if non-NULL */
211 : {
212 : chr *cp;
7188 bruce 213 902023 : chr *realmin = (min == v->stop) ? min : min + 1;
214 902023 : chr *realmax = (max == v->stop) ? max : max + 1;
215 : color co;
216 : struct sset *css;
217 : struct sset *ss;
7368 tgl 218 902023 : struct colormap *cm = d->cm;
219 :
220 : /* prevent "uninitialized variable" warnings */
2746 221 902023 : if (coldp != NULL)
222 901194 : *coldp = NULL;
223 902023 : if (hitstopp != NULL)
224 158 : *hitstopp = 0;
225 :
226 : /* if this is a backref to a known string, just match against that */
768 227 902023 : if (d->backno >= 0)
228 : {
768 tgl 229 UBC 0 : assert((size_t) d->backno < v->nmatch);
230 0 : if (v->pmatch[d->backno].rm_so >= 0)
231 : {
232 0 : cp = dfa_backref(v, d, start, min, max, true);
233 0 : if (cp != NULL && coldp != NULL)
234 0 : *coldp = start;
235 : /* there is no case where we should set *hitstopp */
236 0 : return cp;
237 : }
238 : }
239 :
240 : /* fast path for matchall NFAs */
778 tgl 241 CBC 902023 : if (d->cnfa->flags & MATCHALL)
242 : {
243 986 : size_t nchr = min - start;
244 :
245 986 : if (d->cnfa->maxmatchall != DUPINF &&
246 12 : nchr > d->cnfa->maxmatchall)
778 tgl 247 UBC 0 : return NULL;
778 tgl 248 CBC 986 : if ((max - start) < d->cnfa->minmatchall)
249 9 : return NULL;
250 977 : if (nchr < d->cnfa->minmatchall)
251 65 : min = start + d->cnfa->minmatchall;
252 977 : if (coldp != NULL)
253 447 : *coldp = start;
254 : /* there is no case where we should set *hitstopp */
255 977 : return min;
256 : }
257 :
258 : /* initialize */
7368 259 901037 : css = initialize(v, d, start);
2746 260 901037 : if (css == NULL)
2746 tgl 261 UBC 0 : return NULL;
7368 tgl 262 CBC 901037 : cp = start;
263 :
264 : /* startup */
265 : FDEBUG(("--- startup ---\n"));
7188 bruce 266 901037 : if (cp == v->start)
267 : {
268 453417 : co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
269 : FDEBUG(("color %ld\n", (long) co));
270 : }
271 : else
272 : {
7368 tgl 273 447620 : co = GETCOLOR(cm, *(cp - 1));
274 : FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
275 : }
276 901037 : css = miss(v, d, css, co, cp, start);
277 901037 : if (css == NULL)
278 6 : return NULL;
279 901031 : css->lastseen = cp;
280 901031 : ss = css;
281 :
282 : /*
283 : * This is the main text-scanning loop. It seems worth having two copies
284 : * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
285 : * builds, when you're not actively tracing.
286 : */
287 : #ifdef REG_DEBUG
288 : if (v->eflags & REG_FTRACE)
289 : {
290 : while (cp < realmax)
291 : {
292 : FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
293 : co = GETCOLOR(cm, *cp);
294 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
295 : ss = css->outs[co];
296 : if (ss == NULL)
297 : {
298 : ss = miss(v, d, css, co, cp + 1, start);
299 : if (ss == NULL)
300 : break; /* NOTE BREAK OUT */
301 : }
302 : cp++;
303 : ss->lastseen = cp;
304 : css = ss;
305 : if ((ss->flags & POSTSTATE) && cp >= realmin)
306 : break; /* NOTE BREAK OUT */
307 : }
308 : }
309 : else
310 : #endif
311 : {
7188 bruce 312 18093658 : while (cp < realmax)
313 : {
7368 tgl 314 17850289 : co = GETCOLOR(cm, *cp);
315 17850289 : ss = css->outs[co];
7188 bruce 316 17850289 : if (ss == NULL)
317 : {
318 3123250 : ss = miss(v, d, css, co, cp + 1, start);
7368 tgl 319 3123250 : if (ss == NULL)
7188 bruce 320 96412 : break; /* NOTE BREAK OUT */
321 : }
7368 tgl 322 17753877 : cp++;
323 17753877 : ss->lastseen = cp;
324 17753877 : css = ss;
7188 bruce 325 17753877 : if ((ss->flags & POSTSTATE) && cp >= realmin)
326 561250 : break; /* NOTE BREAK OUT */
327 : }
328 : }
329 :
7368 tgl 330 901031 : if (ss == NULL)
331 96412 : return NULL;
332 :
6385 bruce 333 804619 : if (coldp != NULL) /* report last no-progress state set, if any */
7368 tgl 334 804352 : *coldp = lastcold(v, d);
335 :
7188 bruce 336 804619 : if ((ss->flags & POSTSTATE) && cp > min)
337 : {
7368 tgl 338 561234 : assert(cp >= realmin);
339 561234 : cp--;
340 : }
7188 bruce 341 243385 : else if (cp == v->stop && max == v->stop)
342 : {
343 243385 : co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
344 : FDEBUG(("color %ld\n", (long) co));
7368 tgl 345 243385 : ss = miss(v, d, css, co, cp, start);
346 : /* match might have ended at eol */
7188 bruce 347 243385 : if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
7368 tgl 348 6 : *hitstopp = 1;
349 : }
350 :
7188 bruce 351 804619 : if (ss == NULL || !(ss->flags & POSTSTATE))
7368 tgl 352 234384 : return NULL;
353 :
354 570235 : return cp;
355 : }
356 :
357 : /*
358 : * matchuntil - incremental matching engine
359 : *
360 : * This is meant for use with a search-style NFA (that is, the pattern is
361 : * known to act as though it had a leading .*). We determine whether a
362 : * match exists starting at v->start and ending at probe. Multiple calls
363 : * require only O(N) time not O(N^2) so long as the probe values are
364 : * nondecreasing. *lastcss and *lastcp must be initialized to NULL before
365 : * starting a series of calls.
366 : *
367 : * Returns 1 if a match exists, 0 if not.
368 : * Internal errors also return 0, with v->err set.
369 : */
370 : static int
2118 371 60 : matchuntil(struct vars *v,
372 : struct dfa *d,
373 : chr *probe, /* we want to know if a match ends here */
374 : struct sset **lastcss, /* state storage across calls */
375 : chr **lastcp) /* state storage across calls */
376 : {
2718 377 60 : chr *cp = *lastcp;
378 : color co;
379 60 : struct sset *css = *lastcss;
380 : struct sset *ss;
381 60 : struct colormap *cm = d->cm;
382 :
383 : /* fast path for matchall NFAs */
778 384 60 : if (d->cnfa->flags & MATCHALL)
385 : {
386 18 : size_t nchr = probe - v->start;
387 :
388 18 : if (nchr < d->cnfa->minmatchall)
389 9 : return 0;
390 : /* maxmatchall will always be infinity, cf. makesearch() */
590 391 9 : assert(d->cnfa->maxmatchall == DUPINF);
778 392 9 : return 1;
393 : }
394 :
395 : /* initialize and startup, or restart, if necessary */
2718 396 42 : if (cp == NULL || cp > probe)
397 : {
398 12 : cp = v->start;
399 12 : css = initialize(v, d, cp);
400 12 : if (css == NULL)
2718 tgl 401 UBC 0 : return 0;
402 :
403 : FDEBUG((">>> startup >>>\n"));
2718 tgl 404 CBC 12 : co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
405 : FDEBUG(("color %ld\n", (long) co));
406 :
407 12 : css = miss(v, d, css, co, cp, v->start);
408 12 : if (css == NULL)
2718 tgl 409 UBC 0 : return 0;
2718 tgl 410 CBC 12 : css->lastseen = cp;
411 : }
412 30 : else if (css == NULL)
413 : {
414 : /* we previously found that no match is possible beyond *lastcp */
2718 tgl 415 UBC 0 : return 0;
416 : }
2718 tgl 417 CBC 42 : ss = css;
418 :
419 : /*
420 : * This is the main text-scanning loop. It seems worth having two copies
421 : * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
422 : * builds, when you're not actively tracing.
423 : */
424 : #ifdef REG_DEBUG
425 : if (v->eflags & REG_FTRACE)
426 : {
427 : while (cp < probe)
428 : {
429 : FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
430 : co = GETCOLOR(cm, *cp);
431 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
432 : ss = css->outs[co];
433 : if (ss == NULL)
434 : {
435 : ss = miss(v, d, css, co, cp + 1, v->start);
436 : if (ss == NULL)
437 : break; /* NOTE BREAK OUT */
438 : }
439 : cp++;
440 : ss->lastseen = cp;
441 : css = ss;
442 : }
443 : }
444 : else
445 : #endif
446 : {
447 90 : while (cp < probe)
448 : {
449 48 : co = GETCOLOR(cm, *cp);
450 48 : ss = css->outs[co];
451 48 : if (ss == NULL)
452 : {
453 6 : ss = miss(v, d, css, co, cp + 1, v->start);
454 6 : if (ss == NULL)
2718 tgl 455 UBC 0 : break; /* NOTE BREAK OUT */
456 : }
2718 tgl 457 CBC 48 : cp++;
458 48 : ss->lastseen = cp;
459 48 : css = ss;
460 : }
461 : }
462 :
463 42 : *lastcss = ss;
464 42 : *lastcp = cp;
465 :
466 42 : if (ss == NULL)
2718 tgl 467 UBC 0 : return 0; /* impossible match, or internal error */
468 :
469 : /* We need to process one more chr, or the EOS symbol, to check match */
2718 tgl 470 CBC 42 : if (cp < v->stop)
471 : {
472 : FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
473 42 : co = GETCOLOR(cm, *cp);
474 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
475 42 : ss = css->outs[co];
476 42 : if (ss == NULL)
477 27 : ss = miss(v, d, css, co, cp + 1, v->start);
478 : }
479 : else
480 : {
2718 tgl 481 UBC 0 : assert(cp == v->stop);
482 0 : co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
483 : FDEBUG(("color %ld\n", (long) co));
484 0 : ss = miss(v, d, css, co, cp, v->start);
485 : }
486 :
2718 tgl 487 CBC 42 : if (ss == NULL || !(ss->flags & POSTSTATE))
488 30 : return 0;
489 :
490 12 : return 1;
491 : }
492 :
493 : /*
494 : * dfa_backref - find best match length for a known backref string
495 : *
496 : * When the backref's referent is already available, we can deliver an exact
497 : * answer with considerably less work than running the backref node's NFA.
498 : *
499 : * Return match endpoint for longest or shortest valid repeated match,
500 : * or NULL if there is no valid match.
501 : *
502 : * Should be in sync with cbrdissect(), although that has the different task
503 : * of checking a match to a predetermined section of the string.
504 : */
505 : static chr *
768 506 616 : dfa_backref(struct vars *v,
507 : struct dfa *d,
508 : chr *start, /* where the match should start */
509 : chr *min, /* match must end at or after here */
510 : chr *max, /* match must end at or before here */
511 : bool shortest)
512 : {
513 616 : int n = d->backno;
514 616 : int backmin = d->backmin;
515 616 : int backmax = d->backmax;
516 : size_t numreps;
517 : size_t minreps;
518 : size_t maxreps;
519 : size_t brlen;
520 : chr *brstring;
521 : chr *p;
522 :
523 : /* get the backreferenced string (caller should have checked this) */
524 616 : if (v->pmatch[n].rm_so == -1)
768 tgl 525 UBC 0 : return NULL;
768 tgl 526 CBC 616 : brstring = v->start + v->pmatch[n].rm_so;
527 616 : brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
528 :
529 : /* special-case zero-length backreference to avoid divide by zero */
530 616 : if (brlen == 0)
531 : {
532 : /*
533 : * matches only a zero-length string, but any number of repetitions
534 : * can be considered to be present
535 : */
536 1 : if (min == start && backmin <= backmax)
537 1 : return start;
768 tgl 538 UBC 0 : return NULL;
539 : }
540 :
541 : /*
542 : * convert min and max into numbers of possible repetitions of the backref
543 : * string, rounding appropriately
544 : */
768 tgl 545 CBC 615 : if (min <= start)
546 615 : minreps = 0;
547 : else
768 tgl 548 UBC 0 : minreps = (min - start - 1) / brlen + 1;
768 tgl 549 CBC 615 : maxreps = (max - start) / brlen;
550 :
551 : /* apply bounds, then see if there is any allowed match length */
552 615 : if (minreps < backmin)
553 597 : minreps = backmin;
554 615 : if (backmax != DUPINF && maxreps > backmax)
555 297 : maxreps = backmax;
556 615 : if (maxreps < minreps)
557 134 : return NULL;
558 :
559 : /* quick exit if zero-repetitions match is valid and preferred */
560 481 : if (shortest && minreps == 0)
768 tgl 561 UBC 0 : return start;
562 :
563 : /* okay, compare the actual string contents */
768 tgl 564 CBC 481 : p = start;
565 481 : numreps = 0;
566 570 : while (numreps < maxreps)
567 : {
568 492 : if ((*v->g->compare) (brstring, p, brlen) != 0)
569 403 : break;
570 89 : p += brlen;
571 89 : numreps++;
572 89 : if (shortest && numreps >= minreps)
768 tgl 573 UBC 0 : break;
574 : }
575 :
768 tgl 576 CBC 481 : if (numreps >= minreps)
577 82 : return p;
578 399 : return NULL;
579 : }
580 :
581 : /*
582 : * lastcold - determine last point at which no progress had been made
583 : */
584 : static chr * /* endpoint, or NULL */
2118 585 804352 : lastcold(struct vars *v,
586 : struct dfa *d)
587 : {
588 : struct sset *ss;
589 : chr *nopr;
590 : int i;
591 :
7368 592 804352 : nopr = d->lastnopr;
593 804352 : if (nopr == NULL)
594 804350 : nopr = v->start;
595 4282624 : for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
7188 bruce 596 3478272 : if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
7368 tgl 597 1171118 : nopr = ss->lastseen;
598 804352 : return nopr;
599 : }
600 :
601 : /*
602 : * newdfa - set up a fresh DFA
603 : *
604 : * Returns NULL (and sets v->err) on failure.
605 : */
606 : static struct dfa *
2118 607 1365075 : newdfa(struct vars *v,
608 : struct cnfa *cnfa,
609 : struct colormap *cm,
610 : struct smalldfa *sml) /* preallocated space, may be NULL */
611 : {
612 : struct dfa *d;
7188 bruce 613 1365075 : size_t nss = cnfa->nstates * 2;
614 1365075 : int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
777 tgl 615 1365075 : bool ismalloced = false;
616 :
7368 617 1365075 : assert(cnfa != NULL && cnfa->nstates != 0);
618 :
7188 bruce 619 1365075 : if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
620 : {
7368 tgl 621 1261589 : assert(wordsper == 1);
4069 622 1261589 : if (sml == NULL)
623 : {
624 11653 : sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
625 11653 : if (sml == NULL)
626 : {
7368 tgl 627 UBC 0 : ERR(REG_ESPACE);
628 0 : return NULL;
629 : }
777 tgl 630 CBC 11653 : ismalloced = true;
631 : }
4069 632 1261589 : d = &sml->dfa;
633 1261589 : d->ssets = sml->ssets;
634 1261589 : d->statesarea = sml->statesarea;
7368 635 1261589 : d->work = &d->statesarea[nss];
4069 636 1261589 : d->outsarea = sml->outsarea;
637 1261589 : d->incarea = sml->incarea;
777 638 1261589 : d->ismalloced = ismalloced;
639 1261589 : d->arraysmalloced = false; /* not separately allocated, anyway */
640 : }
641 : else
642 : {
7188 bruce 643 103486 : d = (struct dfa *) MALLOC(sizeof(struct dfa));
644 103486 : if (d == NULL)
645 : {
7368 tgl 646 UBC 0 : ERR(REG_ESPACE);
647 0 : return NULL;
648 : }
7188 bruce 649 CBC 103486 : d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
650 103486 : d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
651 : sizeof(unsigned));
7368 tgl 652 103486 : d->work = &d->statesarea[nss * wordsper];
7188 bruce 653 103486 : d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
654 : sizeof(struct sset *));
655 103486 : d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
656 : sizeof(struct arcp));
777 tgl 657 103486 : d->ismalloced = true;
658 103486 : d->arraysmalloced = true;
659 : /* now freedfa() will behave sanely */
7368 660 103486 : if (d->ssets == NULL || d->statesarea == NULL ||
7188 bruce 661 103486 : d->outsarea == NULL || d->incarea == NULL)
662 : {
7368 tgl 663 UBC 0 : freedfa(d);
664 0 : ERR(REG_ESPACE);
665 0 : return NULL;
666 : }
667 : }
668 :
7188 bruce 669 CBC 1365075 : d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
7368 tgl 670 1365075 : d->nssused = 0;
671 1365075 : d->nstates = cnfa->nstates;
672 1365075 : d->ncolors = cnfa->ncolors;
673 1365075 : d->wordsper = wordsper;
674 1365075 : d->cnfa = cnfa;
675 1365075 : d->cm = cm;
676 1365075 : d->lastpost = NULL;
677 1365075 : d->lastnopr = NULL;
678 1365075 : d->search = d->ssets;
768 679 1365075 : d->backno = -1; /* may be set by caller */
680 1365075 : d->backmin = d->backmax = 0;
681 :
682 : /* initialization of sset fields is done as needed */
683 :
7368 684 1365075 : return d;
685 : }
686 :
687 : /*
688 : * freedfa - free a DFA
689 : */
690 : static void
2118 691 1365075 : freedfa(struct dfa *d)
692 : {
777 693 1365075 : if (d->arraysmalloced)
694 : {
7368 695 103486 : if (d->ssets != NULL)
696 103486 : FREE(d->ssets);
697 103486 : if (d->statesarea != NULL)
698 103486 : FREE(d->statesarea);
699 103486 : if (d->outsarea != NULL)
700 103486 : FREE(d->outsarea);
701 103486 : if (d->incarea != NULL)
702 103486 : FREE(d->incarea);
703 : }
704 :
777 705 1365075 : if (d->ismalloced)
706 115139 : FREE(d);
7368 707 1365075 : }
708 :
709 : /*
710 : * hash - construct a hash code for a bitvector
711 : *
712 : * There are probably better ways, but they're more expensive.
713 : */
714 : static unsigned
715 15519 : hash(unsigned *uv,
716 : int n)
717 : {
718 : int i;
719 : unsigned h;
720 :
721 15519 : h = 0;
722 74517 : for (i = 0; i < n; i++)
723 58998 : h ^= uv[i];
724 15519 : return h;
725 : }
726 :
727 : /*
728 : * initialize - hand-craft a cache entry for startup, otherwise get ready
729 : */
730 : static struct sset *
2118 731 1366701 : initialize(struct vars *v,
732 : struct dfa *d,
733 : chr *start)
734 : {
735 : struct sset *ss;
736 : int i;
737 :
738 : /* is previous one still there? */
7188 bruce 739 1366701 : if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
7368 tgl 740 2936 : ss = &d->ssets[0];
741 : else
742 : { /* no, must (re)build it */
743 1363765 : ss = getvacant(v, d, start, start);
2746 744 1363765 : if (ss == NULL)
2746 tgl 745 UBC 0 : return NULL;
7368 tgl 746 CBC 2728754 : for (i = 0; i < d->wordsper; i++)
747 1364989 : ss->states[i] = 0;
748 1363765 : BSET(ss->states, d->cnfa->pre);
749 1363765 : ss->hash = HASH(ss->states, d->wordsper);
750 1363765 : assert(d->cnfa->pre != d->cnfa->post);
7188 bruce 751 1363765 : ss->flags = STARTER | LOCKED | NOPROGRESS;
752 : /* lastseen dealt with below */
753 : }
754 :
7368 tgl 755 2771074 : for (i = 0; i < d->nssused; i++)
756 1404373 : d->ssets[i].lastseen = NULL;
757 1366701 : ss->lastseen = start; /* maybe untrue, but harmless */
758 1366701 : d->lastpost = NULL;
759 1366701 : d->lastnopr = NULL;
760 1366701 : return ss;
761 : }
762 :
763 : /*
764 : * miss - handle a stateset cache miss
765 : *
766 : * css is the current stateset, co is the color of the current input character,
767 : * cp points to the character after that (which is where we may need to test
768 : * LACONs). start does not affect matching behavior but is needed for pickss'
769 : * heuristics about which stateset cache entry to replace.
770 : *
771 : * Ordinarily, returns the address of the next stateset (the one that is
772 : * valid after consuming the input character). Returns NULL if no valid
773 : * NFA states remain, ie we have a certain match failure.
774 : * Internal errors also return NULL, with v->err set.
775 : */
776 : static struct sset *
2118 777 6223704 : miss(struct vars *v,
778 : struct dfa *d,
779 : struct sset *css,
780 : color co,
781 : chr *cp, /* next chr */
782 : chr *start) /* where the attempt got started */
783 : {
7368 784 6223704 : struct cnfa *cnfa = d->cnfa;
785 : int i;
786 : unsigned h;
787 : struct carc *ca;
788 : struct sset *p;
789 : int ispseudocolor;
790 : int ispost;
791 : int noprogress;
792 : int gotstate;
793 : int dolacons;
794 : int sawlacons;
795 :
796 : /* for convenience, we can be called even if it might not be a miss */
7188 bruce 797 6223704 : if (css->outs[co] != NULL)
798 : {
799 : FDEBUG(("hit\n"));
7368 tgl 800 2294 : return css->outs[co];
801 : }
802 : FDEBUG(("miss\n"));
803 :
804 : /*
805 : * Checking for operation cancel in the inner text search loop seems
806 : * unduly expensive. As a compromise, check during cache misses.
807 : */
1 tmunro 808 GNC 6221410 : INTERRUPT(v->re);
809 :
810 : /*
2746 tgl 811 ECB : * What set of states would we end up in after consuming the co character?
812 : * We first consider PLAIN arcs that consume the character, and then look
813 : * to see what LACON arcs could be traversed after consuming it.
814 : */
7368 tgl 815 CBC 12487813 : for (i = 0; i < d->wordsper; i++)
2746 816 6266403 : d->work[i] = 0; /* build new stateset bitmap in d->work */
778 817 6221410 : ispseudocolor = d->cm->cd[co].flags & PSEUDO;
7368 818 6221410 : ispost = 0;
819 6221410 : noprogress = 1;
820 6221410 : gotstate = 0;
821 47874095 : for (i = 0; i < d->nstates; i++)
7368 tgl 822 GIC 41652685 : if (ISBSET(css->states, i))
3928 tgl 823 CBC 43906243 : for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
778 824 33031975 : if (ca->co == co ||
825 30210464 : (ca->co == RAINBOW && !ispseudocolor))
7188 bruce 826 ECB : {
7368 tgl 827 CBC 11386580 : BSET(d->work, ca->to);
828 11386580 : gotstate = 1;
7368 tgl 829 GIC 11386580 : if (ca->to == cnfa->post)
830 1047233 : ispost = 1;
3928 tgl 831 CBC 11386580 : if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
7368 832 3753357 : noprogress = 0;
7368 tgl 833 ECB : FDEBUG(("%d -> %d\n", i, ca->to));
834 : }
2746 tgl 835 GIC 6221410 : if (!gotstate)
2746 tgl 836 CBC 787037 : return NULL; /* character cannot reach any new state */
2746 tgl 837 GIC 5434373 : dolacons = (cnfa->flags & HASLACONS);
7368 tgl 838 CBC 5434373 : sawlacons = 0;
2746 tgl 839 ECB : /* outer loop handles transitive closure of reachable-by-LACON states */
7188 bruce 840 CBC 5435175 : while (dolacons)
2746 tgl 841 ECB : {
7368 tgl 842 GIC 802 : dolacons = 0;
7368 tgl 843 CBC 7346 : for (i = 0; i < d->nstates; i++)
844 6544 : if (ISBSET(d->work, i))
3928 845 2600 : for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
7188 bruce 846 ECB : {
3928 tgl 847 CBC 1560 : if (ca->co < cnfa->ncolors)
2118 848 1374 : continue; /* not a LACON arc */
7368 tgl 849 GIC 186 : if (ISBSET(d->work, ca->to))
2118 tgl 850 CBC 66 : continue; /* arc would be a no-op anyway */
2118 tgl 851 GBC 120 : sawlacons = 1; /* this LACON affects our result */
7368 tgl 852 CBC 120 : if (!lacon(v, cnfa, cp, ca->co))
853 : {
2746 854 63 : if (ISERR())
2746 tgl 855 UBC 0 : return NULL;
2118 tgl 856 CBC 63 : continue; /* LACON arc cannot be traversed */
2746 tgl 857 ECB : }
2746 tgl 858 CBC 57 : if (ISERR())
2746 tgl 859 UBC 0 : return NULL;
7368 tgl 860 CBC 57 : BSET(d->work, ca->to);
861 57 : dolacons = 1;
7368 tgl 862 GIC 57 : if (ca->to == cnfa->post)
7368 tgl 863 UIC 0 : ispost = 1;
3928 tgl 864 GIC 57 : if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
7368 tgl 865 CBC 57 : noprogress = 0;
866 : FDEBUG(("%d :> %d\n", i, ca->to));
867 : }
7368 tgl 868 ECB : }
7368 tgl 869 CBC 5434373 : h = HASH(d->work, d->wordsper);
870 :
871 : /* Is this stateset already in the cache? */
7368 tgl 872 GIC 18931848 : for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
7188 bruce 873 14673285 : if (HIT(h, d->work, p, d->wordsper))
7188 bruce 874 ECB : {
875 : FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
876 : break; /* NOTE BREAK OUT */
7368 tgl 877 : }
7188 bruce 878 GBC 5434373 : if (i == 0)
7188 bruce 879 ECB : { /* nope, need a new cache entry */
7368 tgl 880 CBC 4258563 : p = getvacant(v, d, cp, start);
2746 881 4258563 : if (p == NULL)
2746 tgl 882 LBC 0 : return NULL;
7368 tgl 883 CBC 4258563 : assert(p != css);
884 8555004 : for (i = 0; i < d->wordsper; i++)
885 4296441 : p->states[i] = d->work[i];
7368 tgl 886 GIC 4258563 : p->hash = h;
887 4258563 : p->flags = (ispost) ? POSTSTATE : 0;
888 4258563 : if (noprogress)
889 1364765 : p->flags |= NOPROGRESS;
890 : /* lastseen to be dealt with by caller */
891 : }
892 :
893 : /*
894 : * Link new stateset to old, unless a LACON affected the result, in which
895 : * case we don't create the link. That forces future transitions across
2746 tgl 896 ECB : * this same arc (same prior stateset and character color) to come through
897 : * miss() again, so that we can recheck the LACON(s), which might or might
898 : * not pass since context will be different.
899 : */
7188 bruce 900 CBC 5434373 : if (!sawlacons)
2746 tgl 901 ECB : {
5664 902 : FDEBUG(("c%d[%d]->c%d\n",
903 : (int) (css - d->ssets), co, (int) (p - d->ssets)));
7368 tgl 904 GIC 5434285 : css->outs[co] = p;
7368 tgl 905 CBC 5434285 : css->inchain[co] = p->ins;
7368 tgl 906 GIC 5434285 : p->ins.ss = css;
2424 907 5434285 : p->ins.co = co;
908 : }
7368 909 5434373 : return p;
910 : }
911 :
7368 tgl 912 ECB : /*
913 : * lacon - lookaround-constraint checker for miss()
914 : */
915 : static int /* predicate: constraint satisfied? */
2118 tgl 916 GIC 120 : lacon(struct vars *v,
917 : struct cnfa *pcnfa, /* parent cnfa */
918 : chr *cp,
919 : color co) /* "color" of the lookaround constraint */
920 : {
921 : int n;
922 : struct subre *sub;
923 : struct dfa *d;
7188 bruce 924 ECB : chr *end;
925 : int satisfied;
7368 tgl 926 EUB :
2746 927 : /* Since this is recursive, it could be driven to stack overflow */
2746 tgl 928 GIC 120 : if (STACK_TOO_DEEP(v->re))
929 : {
2746 tgl 930 LBC 0 : ERR(REG_ETOOBIG);
931 0 : return 0;
932 : }
2746 tgl 933 ECB :
7368 tgl 934 CBC 120 : n = co - pcnfa->ncolors;
2718 935 120 : assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
7368 tgl 936 EUB : FDEBUG(("=== testing lacon %d\n", n));
7368 tgl 937 CBC 120 : sub = &v->g->lacons[n];
2718 tgl 938 GIC 120 : d = getladfa(v, n);
7188 bruce 939 120 : if (d == NULL)
7368 tgl 940 LBC 0 : return 0;
778 tgl 941 GIC 120 : if (LATYPE_IS_AHEAD(sub->latype))
2718 tgl 942 ECB : {
943 : /* used to use longest() here, but shortest() could be much cheaper */
2718 tgl 944 GIC 60 : end = shortest(v, d, cp, cp, v->stop,
945 : (chr **) NULL, (int *) NULL);
778 946 60 : satisfied = LATYPE_IS_POS(sub->latype) ? (end != NULL) : (end == NULL);
947 : }
948 : else
949 : {
950 : /*
951 : * To avoid doing O(N^2) work when repeatedly testing a lookbehind
952 : * constraint in an N-character string, we use matchuntil() which can
953 : * cache the DFA state across calls. We only need to restart if the
2718 tgl 954 ECB : * probe point decreases, which is not common. The NFA we're using is
955 : * a search NFA, so it doesn't mind scanning over stuff before the
2718 tgl 956 EUB : * nominal match.
957 : */
2718 tgl 958 GIC 60 : satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
778 tgl 959 CBC 60 : if (!LATYPE_IS_POS(sub->latype))
2718 tgl 960 UIC 0 : satisfied = !satisfied;
961 : }
962 : FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
2718 tgl 963 GIC 120 : return satisfied;
964 : }
965 :
966 : /*
967 : * getvacant - get a vacant state set
968 : *
7368 tgl 969 ECB : * This routine clears out the inarcs and outarcs, but does not otherwise
970 : * clear the innards of the state set -- that's up to the caller.
971 : */
972 : static struct sset *
2118 tgl 973 GIC 5622328 : getvacant(struct vars *v,
974 : struct dfa *d,
975 : chr *cp,
976 : chr *start)
977 : {
978 : int i;
979 : struct sset *ss;
7368 tgl 980 ECB : struct sset *p;
981 : struct arcp ap;
7188 bruce 982 EUB : color co;
7368 tgl 983 ECB :
7368 tgl 984 GIC 5622328 : ss = pickss(v, d, cp, start);
2746 985 5622328 : if (ss == NULL)
2746 tgl 986 LBC 0 : return NULL;
7188 bruce 987 CBC 5622328 : assert(!(ss->flags & LOCKED));
988 :
7368 tgl 989 ECB : /* clear out its inarcs, including self-referential ones */
7368 tgl 990 GIC 5622328 : ap = ss->ins;
7188 bruce 991 CBC 5622340 : while ((p = ap.ss) != NULL)
7188 bruce 992 ECB : {
7368 tgl 993 CBC 12 : co = ap.co;
994 : FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
995 12 : p->outs[co] = NULL;
7368 tgl 996 GIC 12 : ap = p->inchain[co];
2118 997 12 : p->inchain[co].ss = NULL; /* paranoia */
7368 tgl 998 ECB : }
7368 tgl 999 GIC 5622328 : ss->ins.ss = NULL;
7368 tgl 1000 ECB :
1001 : /* take it off the inarc chains of the ssets reached by its outarcs */
7188 bruce 1002 CBC 48615868 : for (i = 0; i < d->ncolors; i++)
7188 bruce 1003 ECB : {
7368 tgl 1004 GIC 42993540 : p = ss->outs[i];
7368 tgl 1005 CBC 42993540 : assert(p != ss); /* not self-referential */
1006 42993540 : if (p == NULL)
7188 bruce 1007 GIC 42993474 : continue; /* NOTE CONTINUE */
1008 : FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
7368 tgl 1009 CBC 66 : if (p->ins.ss == ss && p->ins.co == i)
7368 tgl 1010 GIC 60 : p->ins = ss->inchain[i];
7188 bruce 1011 ECB : else
1012 : {
6406 tgl 1013 CBC 6 : struct arcp lastap = {NULL, 0};
6406 tgl 1014 ECB :
7368 tgl 1015 CBC 6 : assert(p->ins.ss != NULL);
1016 12 : for (ap = p->ins; ap.ss != NULL &&
7188 bruce 1017 12 : !(ap.ss == ss && ap.co == i);
7188 bruce 1018 GIC 6 : ap = ap.ss->inchain[ap.co])
7368 tgl 1019 CBC 6 : lastap = ap;
1020 6 : assert(ap.ss != NULL);
7368 tgl 1021 GIC 6 : lastap.ss->inchain[lastap.co] = ss->inchain[i];
1022 : }
1023 66 : ss->outs[i] = NULL;
7368 tgl 1024 CBC 66 : ss->inchain[i].ss = NULL;
7368 tgl 1025 ECB : }
1026 :
1027 : /* if ss was a success state, may need to remember location */
7188 bruce 1028 GIC 5622328 : if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
7188 bruce 1029 CBC 18 : (d->lastpost == NULL || d->lastpost < ss->lastseen))
7368 tgl 1030 18 : d->lastpost = ss->lastseen;
7368 tgl 1031 ECB :
1032 : /* likewise for a no-progress state */
7188 bruce 1033 CBC 5622328 : if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
7188 bruce 1034 GIC 6 : (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
7368 tgl 1035 6 : d->lastnopr = ss->lastseen;
1036 :
1037 5622328 : return ss;
1038 : }
1039 :
7368 tgl 1040 ECB : /*
1041 : * pickss - pick the next stateset to be used
1042 : */
1043 : static struct sset *
2118 tgl 1044 GIC 5622328 : pickss(struct vars *v,
1045 : struct dfa *d,
1046 : chr *cp,
1047 : chr *start)
1048 : {
1049 : int i;
1050 : struct sset *ss;
7368 tgl 1051 ECB : struct sset *end;
1052 : chr *ancient;
1053 :
1054 : /* shortcut for cases where cache isn't full */
7188 bruce 1055 CBC 5622328 : if (d->nssused < d->nssets)
1056 : {
7368 tgl 1057 GIC 5622262 : i = d->nssused;
7368 tgl 1058 CBC 5622262 : d->nssused++;
1059 5622262 : ss = &d->ssets[i];
7368 tgl 1060 ECB : FDEBUG(("new c%d\n", i));
1061 : /* set up innards */
7368 tgl 1062 CBC 5622262 : ss->states = &d->statesarea[i * d->wordsper];
1063 5622262 : ss->flags = 0;
1064 5622262 : ss->ins.ss = NULL;
7368 tgl 1065 GIC 5622262 : ss->ins.co = WHITE; /* give it some value */
7368 tgl 1066 CBC 5622262 : ss->outs = &d->outsarea[i * d->ncolors];
1067 5622262 : ss->inchain = &d->incarea[i * d->ncolors];
7188 bruce 1068 GIC 48614266 : for (i = 0; i < d->ncolors; i++)
7188 bruce 1069 ECB : {
7368 tgl 1070 GIC 42992004 : ss->outs[i] = NULL;
1071 42992004 : ss->inchain[i].ss = NULL;
1072 : }
7368 tgl 1073 CBC 5622262 : return ss;
7368 tgl 1074 ECB : }
1075 :
7368 tgl 1076 EUB : /* look for oldest, or old enough anyway */
7188 bruce 1077 CBC 66 : if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
1078 66 : ancient = cp - d->nssets * 2 / 3;
7368 tgl 1079 ECB : else
7368 tgl 1080 UIC 0 : ancient = start;
7368 tgl 1081 CBC 72 : for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
7368 tgl 1082 GIC 66 : if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
7188 bruce 1083 CBC 66 : !(ss->flags & LOCKED))
1084 : {
7368 tgl 1085 60 : d->search = ss + 1;
5664 tgl 1086 ECB : FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
7368 tgl 1087 CBC 60 : return ss;
1088 : }
1089 12 : for (ss = d->ssets, end = d->search; ss < end; ss++)
7368 tgl 1090 GIC 12 : if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
7188 bruce 1091 CBC 12 : !(ss->flags & LOCKED))
1092 : {
7368 tgl 1093 GIC 6 : d->search = ss + 1;
1094 : FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
1095 6 : return ss;
7368 tgl 1096 EUB : }
1097 :
1098 : /* nobody's old enough?!? -- something's really wrong */
1099 : FDEBUG(("cannot find victim to replace!\n"));
7368 tgl 1100 UIC 0 : ERR(REG_ASSERT);
2746 1101 0 : return NULL;
1102 : }
|