Age Owner TLA Line data Source code
1 : /*
2 : * re_*exec and friends - match REs
3 : *
4 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
5 : *
6 : * Development of this software was funded, in part, by Cray Research Inc.,
7 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 : * Corporation, none of whom are responsible for the results. The author
9 : * thanks all of them.
10 : *
11 : * Redistribution and use in source and binary forms -- with or without
12 : * modification -- are permitted for any purpose, provided that
13 : * redistributions in source form retain this entire copyright notice and
14 : * indicate the origin and nature of any modifications.
15 : *
16 : * I'd appreciate being given credit for this package in the documentation
17 : * of software which uses it, but that is not a requirement.
18 : *
19 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 : *
30 : * src/backend/regex/regexec.c
31 : *
32 : */
33 :
34 : #include "regex/regguts.h"
35 :
36 :
37 :
38 : /* lazy-DFA representation */
39 : struct arcp
40 : { /* "pointer" to an outarc */
41 : struct sset *ss;
42 : color co;
43 : };
44 :
45 : struct sset
46 : { /* state set */
47 : unsigned *states; /* pointer to bitvector */
48 : unsigned hash; /* hash of bitvector */
49 : #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
50 : #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
51 : memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
52 : int flags;
53 : #define STARTER 01 /* the initial state set */
54 : #define POSTSTATE 02 /* includes the goal state */
55 : #define LOCKED 04 /* locked in cache */
56 : #define NOPROGRESS 010 /* zero-progress state set */
57 : struct arcp ins; /* chain of inarcs pointing here */
58 : chr *lastseen; /* last entered on arrival here */
59 : struct sset **outs; /* outarc vector indexed by color */
60 : struct arcp *inchain; /* chain-pointer vector for outarcs */
61 : };
62 :
63 : struct dfa
64 : {
65 : int nssets; /* size of cache */
66 : int nssused; /* how many entries occupied yet */
67 : int nstates; /* number of states */
68 : int ncolors; /* length of outarc and inchain vectors */
69 : int wordsper; /* length of state-set bitvectors */
70 : struct sset *ssets; /* state-set cache */
71 : unsigned *statesarea; /* bitvector storage */
72 : unsigned *work; /* pointer to work area within statesarea */
73 : struct sset **outsarea; /* outarc-vector storage */
74 : struct arcp *incarea; /* inchain storage */
75 : struct cnfa *cnfa;
76 : struct colormap *cm;
77 : chr *lastpost; /* location of last cache-flushed success */
78 : chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
79 : struct sset *search; /* replacement-search-pointer memory */
80 : int backno; /* if DFA for a backref, subno it refers to */
81 : short backmin; /* min repetitions for backref */
82 : short backmax; /* max repetitions for backref */
83 : bool ismalloced; /* should this struct dfa be freed? */
84 : bool arraysmalloced; /* should its subsidiary arrays be freed? */
85 : };
86 :
87 : #define WORK 1 /* number of work bitvectors needed */
88 :
89 : /* setup for non-malloc allocation for small cases */
90 : #define FEWSTATES 20 /* must be less than UBITS */
91 : #define FEWCOLORS 15
92 : struct smalldfa
93 : {
94 : struct dfa dfa; /* must be first */
95 : struct sset ssets[FEWSTATES * 2];
96 : unsigned statesarea[FEWSTATES * 2 + WORK];
97 : struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
98 : struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
99 : };
100 :
101 : #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */
102 :
103 :
104 :
105 : /* internal variables, bundled for easy passing around */
106 : struct vars
107 : {
108 : regex_t *re;
109 : struct guts *g;
110 : int eflags; /* copies of arguments */
111 : size_t nmatch;
112 : regmatch_t *pmatch;
113 : rm_detail_t *details;
114 : chr *start; /* start of string */
115 : chr *search_start; /* search start of string */
116 : chr *stop; /* just past end of string */
117 : int err; /* error code if any (0 none) */
118 : struct dfa **subdfas; /* per-tree-subre DFAs */
119 : struct dfa **ladfas; /* per-lacon-subre DFAs */
120 : struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
121 : chr **lblastcp; /* per-lacon-subre lookbehind restart data */
122 : struct smalldfa dfa1;
123 : struct smalldfa dfa2;
124 : };
125 :
126 : #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
127 : #define ISERR() VISERR(v)
128 : #define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e)))
129 : #define ERR(e) VERR(v, e) /* record an error */
130 : #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
131 : #define OFF(p) ((p) - v->start)
132 : #define LOFF(p) ((long)OFF(p))
133 :
134 :
135 :
136 : /*
137 : * forward declarations
138 : */
139 : /* === regexec.c === */
140 : static struct dfa *getsubdfa(struct vars *v, struct subre *t);
141 : static struct dfa *getladfa(struct vars *v, int n);
142 : static int find(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
143 : static int cfind(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
144 : static int cfindloop(struct vars *v, struct cnfa *cnfa, struct colormap *cm,
145 : struct dfa *d, struct dfa *s, chr **coldp);
146 : static void zapallsubs(regmatch_t *p, size_t n);
147 : static void zaptreesubs(struct vars *v, struct subre *t);
148 : static void subset(struct vars *v, struct subre *sub, chr *begin, chr *end);
149 : static int cdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
150 : static int ccondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
151 : static int crevcondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
152 : static int cbrdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
153 : static int caltdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
154 : static int citerdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
155 : static int creviterdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
156 :
157 : /* === rege_dfa.c === */
158 : static chr *longest(struct vars *v, struct dfa *d,
159 : chr *start, chr *stop, int *hitstopp);
160 : static chr *shortest(struct vars *v, struct dfa *d, chr *start, chr *min,
161 : chr *max, chr **coldp, int *hitstopp);
162 : static int matchuntil(struct vars *v, struct dfa *d, chr *probe,
163 : struct sset **lastcss, chr **lastcp);
164 : static chr *dfa_backref(struct vars *v, struct dfa *d, chr *start,
165 : chr *min, chr *max, bool shortest);
166 : static chr *lastcold(struct vars *v, struct dfa *d);
167 : static struct dfa *newdfa(struct vars *v, struct cnfa *cnfa,
168 : struct colormap *cm, struct smalldfa *sml);
169 : static void freedfa(struct dfa *d);
170 : static unsigned hash(unsigned *uv, int n);
171 : static struct sset *initialize(struct vars *v, struct dfa *d, chr *start);
172 : static struct sset *miss(struct vars *v, struct dfa *d, struct sset *css,
173 : color co, chr *cp, chr *start);
174 : static int lacon(struct vars *v, struct cnfa *pcnfa, chr *cp, color co);
175 : static struct sset *getvacant(struct vars *v, struct dfa *d, chr *cp,
176 : chr *start);
177 : static struct sset *pickss(struct vars *v, struct dfa *d, chr *cp,
178 : chr *start);
179 :
180 :
181 : /*
182 : * pg_regexec - match regular expression
183 : */
184 : int
7368 tgl 185 GIC 901896 : pg_regexec(regex_t *re,
186 : const chr *string,
187 : size_t len,
188 : size_t search_start,
189 : rm_detail_t *details,
190 : size_t nmatch,
191 : regmatch_t pmatch[],
192 : int flags)
193 : {
7368 tgl 194 ECB : struct vars var;
197 andres 195 GNC 901896 : struct vars *v = &var;
196 : int st;
197 : size_t n;
198 : size_t i;
199 : int backref;
200 :
201 : #define LOCALMAT 20
202 : regmatch_t mat[LOCALMAT];
203 :
4062 tgl 204 ECB : #define LOCALDFAS 40
205 : struct dfa *subdfas[LOCALDFAS];
206 :
207 : /* sanity checks */
7368 tgl 208 GIC 901896 : if (re == NULL || string == NULL || re->re_magic != REMAGIC)
7368 tgl 209 UIC 0 : return REG_INVARG;
7368 tgl 210 GIC 901896 : if (re->re_csize != sizeof(chr))
7368 tgl 211 UIC 0 : return REG_MIXED;
575 tgl 212 GIC 901896 : if (search_start > len)
213 3 : return REG_NOMATCH;
214 :
215 : /* Initialize locale-dependent support */
4382 216 901893 : pg_set_regex_collation(re->re_collation);
4382 tgl 217 ECB :
7368 tgl 218 EUB : /* setup */
7368 tgl 219 CBC 901893 : v->re = re;
7188 bruce 220 GBC 901893 : v->g = (struct guts *) re->re_guts;
7188 bruce 221 CBC 901893 : if ((v->g->cflags & REG_EXPECT) && details == NULL)
7368 tgl 222 LBC 0 : return REG_INVARG;
7188 bruce 223 GIC 901893 : if (v->g->info & REG_UIMPOSSIBLE)
7368 tgl 224 715 : return REG_NOMATCH;
7188 bruce 225 CBC 901178 : backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
7368 tgl 226 GIC 901178 : v->eflags = flags;
608 227 901178 : if (backref && nmatch <= v->g->nsub)
7188 bruce 228 ECB : {
608 tgl 229 : /* need larger work area */
594 tgl 230 CBC 91 : v->nmatch = v->g->nsub + 1;
594 tgl 231 GBC 91 : if (v->nmatch <= LOCALMAT)
7368 tgl 232 CBC 90 : v->pmatch = mat;
7368 tgl 233 ECB : else
594 tgl 234 CBC 1 : v->pmatch = (regmatch_t *) MALLOC(v->nmatch * sizeof(regmatch_t));
7368 235 91 : if (v->pmatch == NULL)
7368 tgl 236 LBC 0 : return REG_ESPACE;
594 tgl 237 GIC 91 : zapallsubs(v->pmatch, v->nmatch);
238 : }
7188 bruce 239 ECB : else
608 tgl 240 : {
241 : /* we can store results directly in caller's array */
7368 tgl 242 GIC 901087 : v->pmatch = pmatch;
608 tgl 243 ECB : /* ensure any extra entries in caller's array are filled with -1 */
608 tgl 244 CBC 901087 : if (nmatch > 0)
608 tgl 245 GBC 556472 : zapallsubs(pmatch, nmatch);
608 tgl 246 ECB : /* then forget about extra entries, to avoid useless work in find() */
608 tgl 247 GIC 901087 : if (nmatch > v->g->nsub + 1)
248 677 : nmatch = v->g->nsub + 1;
249 901087 : v->nmatch = nmatch;
250 : }
7368 tgl 251 CBC 901178 : v->details = details;
7188 bruce 252 GIC 901178 : v->start = (chr *) string;
6482 bruce 253 CBC 901178 : v->search_start = (chr *) string + search_start;
7188 254 901178 : v->stop = (chr *) string + len;
7368 tgl 255 GIC 901178 : v->err = 0;
2718 tgl 256 CBC 901178 : v->subdfas = NULL;
257 901178 : v->ladfas = NULL;
258 901178 : v->lblastcss = NULL;
2718 tgl 259 GIC 901178 : v->lblastcp = NULL;
2718 tgl 260 ECB : /* below this point, "goto cleanup" will behave sanely */
261 :
4062 tgl 262 CBC 901178 : assert(v->g->ntree >= 0);
263 901178 : n = (size_t) v->g->ntree;
264 901178 : if (n <= LOCALDFAS)
265 901176 : v->subdfas = subdfas;
4062 tgl 266 ECB : else
267 : {
2718 tgl 268 CBC 2 : v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
2718 tgl 269 GIC 2 : if (v->subdfas == NULL)
270 : {
2718 tgl 271 LBC 0 : st = REG_ESPACE;
272 0 : goto cleanup;
2718 tgl 273 ECB : }
4062 274 : }
4062 tgl 275 GIC 2719024 : for (i = 0; i < n; i++)
276 1817846 : v->subdfas[i] = NULL;
7368 tgl 277 ECB :
2718 tgl 278 CBC 901178 : assert(v->g->nlacons >= 0);
2718 tgl 279 GIC 901178 : n = (size_t) v->g->nlacons;
2718 tgl 280 GBC 901178 : if (n > 0)
2718 tgl 281 EUB : {
2718 tgl 282 GIC 630 : v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
283 630 : if (v->ladfas == NULL)
2718 tgl 284 ECB : {
2718 tgl 285 LBC 0 : st = REG_ESPACE;
2718 tgl 286 UIC 0 : goto cleanup;
2718 tgl 287 ECB : }
2718 tgl 288 CBC 1922 : for (i = 0; i < n; i++)
289 1292 : v->ladfas[i] = NULL;
2718 tgl 290 GIC 630 : v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
2718 tgl 291 CBC 630 : v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
292 630 : if (v->lblastcss == NULL || v->lblastcp == NULL)
293 : {
2718 tgl 294 UBC 0 : st = REG_ESPACE;
295 0 : goto cleanup;
296 : }
2718 tgl 297 CBC 1922 : for (i = 0; i < n; i++)
2718 tgl 298 ECB : {
2718 tgl 299 CBC 1292 : v->lblastcss[i] = NULL;
300 1292 : v->lblastcp[i] = NULL;
2718 tgl 301 ECB : }
302 : }
2718 tgl 303 EUB :
7368 304 : /* do it */
7368 tgl 305 GIC 901178 : assert(v->g->tree != NULL);
7368 tgl 306 CBC 901178 : if (backref)
7368 tgl 307 GIC 145 : st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
7368 tgl 308 ECB : else
7368 tgl 309 CBC 901033 : st = find(v, &v->g->tree->cnfa, &v->g->cmap);
310 :
311 : /* on success, ensure caller's match vector is filled correctly */
608 tgl 312 GIC 901178 : if (st == REG_OKAY && nmatch > 0)
313 : {
608 tgl 314 CBC 451389 : if (v->pmatch != pmatch)
608 tgl 315 ECB : {
316 : /* copy portion of match vector over from (larger) work area */
608 tgl 317 GIC 21 : assert(nmatch <= v->nmatch);
608 tgl 318 CBC 21 : memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
319 : }
608 tgl 320 GIC 451389 : if (v->g->cflags & REG_NOSUB)
608 tgl 321 ECB : {
322 : /* don't expose possibly-partial sub-match results to caller */
608 tgl 323 CBC 448309 : zapallsubs(pmatch, nmatch);
324 : }
325 : }
7368 tgl 326 ECB :
327 : /* clean up */
2718 tgl 328 GIC 452869 : cleanup:
7368 tgl 329 CBC 901178 : if (v->pmatch != pmatch && v->pmatch != mat)
7368 tgl 330 GIC 1 : FREE(v->pmatch);
2718 331 901178 : if (v->subdfas != NULL)
2718 tgl 332 ECB : {
2718 tgl 333 GIC 901178 : n = (size_t) v->g->ntree;
334 2719024 : for (i = 0; i < n; i++)
335 : {
336 1817846 : if (v->subdfas[i] != NULL)
2718 tgl 337 CBC 12368 : freedfa(v->subdfas[i]);
2718 tgl 338 ECB : }
2718 tgl 339 CBC 901178 : if (v->subdfas != subdfas)
340 2 : FREE(v->subdfas);
341 : }
342 901178 : if (v->ladfas != NULL)
4062 tgl 343 ECB : {
2718 tgl 344 GIC 630 : n = (size_t) v->g->nlacons;
2718 tgl 345 CBC 1922 : for (i = 0; i < n; i++)
2718 tgl 346 ECB : {
2718 tgl 347 GIC 1292 : if (v->ladfas[i] != NULL)
2718 tgl 348 CBC 53 : freedfa(v->ladfas[i]);
2718 tgl 349 ECB : }
2718 tgl 350 GIC 630 : FREE(v->ladfas);
4062 tgl 351 ECB : }
2718 tgl 352 GIC 901178 : if (v->lblastcss != NULL)
2718 tgl 353 CBC 630 : FREE(v->lblastcss);
354 901178 : if (v->lblastcp != NULL)
2718 tgl 355 GIC 630 : FREE(v->lblastcp);
4062 tgl 356 ECB :
781 357 : #ifdef REG_DEBUG
358 : if (v->eflags & (REG_FTRACE | REG_MTRACE))
359 : fflush(stdout);
360 : #endif
361 :
7368 tgl 362 CBC 901178 : return st;
7368 tgl 363 ECB : }
364 :
365 : /*
366 : * getsubdfa - create or re-fetch the DFA for a tree subre node
367 : *
368 : * We only need to create the DFA once per overall regex execution.
369 : * The DFA will be freed by the cleanup step in pg_regexec().
370 : */
4062 371 : static struct dfa *
2118 tgl 372 GIC 13054 : getsubdfa(struct vars *v,
373 : struct subre *t)
374 : {
768 375 13054 : struct dfa *d = v->subdfas[t->id];
376 :
377 13054 : if (d == NULL)
378 : {
379 12368 : d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
762 380 12368 : if (d == NULL)
4062 tgl 381 LBC 0 : return NULL;
382 : /* set up additional info if this is a backref node */
768 tgl 383 GIC 12368 : if (t->op == 'b')
768 tgl 384 ECB : {
768 tgl 385 GIC 140 : d->backno = t->backno;
768 tgl 386 CBC 140 : d->backmin = t->min;
768 tgl 387 GIC 140 : d->backmax = t->max;
768 tgl 388 ECB : }
768 tgl 389 CBC 12368 : v->subdfas[t->id] = d;
4062 tgl 390 EUB : }
768 tgl 391 GIC 13054 : return d;
4062 tgl 392 ECB : }
393 :
2718 394 : /*
395 : * getladfa - create or re-fetch the DFA for a LACON subre node
396 : *
397 : * Same as above, but for LACONs.
398 : */
399 : static struct dfa *
2118 tgl 400 CBC 120 : getladfa(struct vars *v,
401 : int n)
402 : {
2718 tgl 403 GIC 120 : assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
404 :
405 120 : if (v->ladfas[n] == NULL)
406 : {
407 53 : struct subre *sub = &v->g->lacons[n];
408 :
2718 tgl 409 CBC 53 : v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
410 : /* a LACON can't contain a backref, so nothing else to do */
411 : }
412 120 : return v->ladfas[n];
413 : }
2718 tgl 414 ECB :
415 : /*
7368 416 : * find - find a match for the main NFA (no-complications case)
417 : */
418 : static int
2118 tgl 419 GIC 901033 : find(struct vars *v,
420 : struct cnfa *cnfa,
2118 tgl 421 ECB : struct colormap *cm)
422 : {
423 : struct dfa *s;
424 : struct dfa *d;
425 : chr *begin;
7188 bruce 426 GIC 901033 : chr *end = NULL;
427 : chr *cold;
6385 bruce 428 ECB : chr *open; /* open and close of range of possible starts */
429 : chr *close;
430 : int hitend;
7188 bruce 431 GIC 901033 : int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
432 :
433 : /* first, a shot with the search RE */
7368 tgl 434 901033 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
762 tgl 435 CBC 901033 : if (s == NULL)
762 tgl 436 UIC 0 : return v->err;
437 : MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
7368 tgl 438 GIC 901033 : cold = NULL;
6482 bruce 439 901033 : close = shortest(v, s, v->search_start, v->search_start, v->stop,
6482 bruce 440 ECB : &cold, (int *) NULL);
7368 tgl 441 GIC 901033 : freedfa(s);
442 901033 : NOERR();
7188 bruce 443 CBC 901033 : if (v->g->cflags & REG_EXPECT)
7188 bruce 444 ECB : {
7368 tgl 445 GBC 27 : assert(v->details != NULL);
7368 tgl 446 GIC 27 : if (cold != NULL)
7368 tgl 447 CBC 27 : v->details->rm_extend.rm_so = OFF(cold);
7368 tgl 448 ECB : else
7368 tgl 449 UIC 0 : v->details->rm_extend.rm_so = OFF(v->stop);
2118 tgl 450 CBC 27 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
7368 tgl 451 ECB : }
7188 bruce 452 CBC 901033 : if (close == NULL) /* not found */
7368 tgl 453 GIC 330758 : return REG_NOMATCH;
7188 bruce 454 CBC 570275 : if (v->nmatch == 0) /* found, don't need exact location */
7368 tgl 455 118944 : return REG_OKAY;
7368 tgl 456 ECB :
457 : /* find starting point and match */
7368 tgl 458 GBC 451331 : assert(cold != NULL);
7368 tgl 459 CBC 451331 : open = cold;
7368 tgl 460 GIC 451331 : cold = NULL;
7368 tgl 461 ECB : MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
7368 tgl 462 CBC 451331 : d = newdfa(v, cnfa, cm, &v->dfa1);
762 463 451331 : if (d == NULL)
762 tgl 464 LBC 0 : return v->err;
7188 bruce 465 GIC 451360 : for (begin = open; begin <= close; begin++)
466 : {
7368 tgl 467 ECB : MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
7368 tgl 468 CBC 451360 : if (shorter)
469 61 : end = shortest(v, d, begin, begin, v->stop,
470 : (chr **) NULL, &hitend);
7368 tgl 471 ECB : else
7368 tgl 472 CBC 451299 : end = longest(v, d, begin, v->stop, &hitend);
2760 tgl 473 GBC 451360 : if (ISERR())
2760 tgl 474 ECB : {
2760 tgl 475 UIC 0 : freedfa(d);
476 0 : return v->err;
2760 tgl 477 ECB : }
7368 tgl 478 CBC 451360 : if (hitend && cold == NULL)
7368 tgl 479 GIC 3061 : cold = begin;
480 451360 : if (end != NULL)
7188 bruce 481 CBC 451331 : break; /* NOTE BREAK OUT */
7368 tgl 482 ECB : }
7368 tgl 483 GIC 451331 : assert(end != NULL); /* search RE succeeded so loop should */
7368 tgl 484 GBC 451331 : freedfa(d);
7368 tgl 485 EUB :
486 : /* and pin down details */
7368 tgl 487 CBC 451331 : assert(v->nmatch > 0);
488 451331 : v->pmatch[0].rm_so = OFF(begin);
489 451331 : v->pmatch[0].rm_eo = OFF(end);
7188 bruce 490 451331 : if (v->g->cflags & REG_EXPECT)
491 : {
7368 tgl 492 10 : if (cold != NULL)
493 5 : v->details->rm_extend.rm_so = OFF(cold);
494 : else
7368 tgl 495 GIC 5 : v->details->rm_extend.rm_so = OFF(v->stop);
2118 tgl 496 CBC 10 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
7368 tgl 497 ECB : }
7188 bruce 498 CBC 451331 : if (v->nmatch == 1) /* no need for submatches */
7368 tgl 499 449125 : return REG_OKAY;
500 :
4062 tgl 501 ECB : /* find submatches */
4062 tgl 502 CBC 2206 : return cdissect(v, v->g->tree, begin, end);
503 : }
7368 tgl 504 ECB :
505 : /*
506 : * cfind - find a match for the main NFA (with complications)
507 : */
508 : static int
2118 tgl 509 GIC 145 : cfind(struct vars *v,
510 : struct cnfa *cnfa,
2118 tgl 511 ECB : struct colormap *cm)
512 : {
513 : struct dfa *s;
514 : struct dfa *d;
515 : chr *cold;
516 : int ret;
517 :
7368 tgl 518 CBC 145 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
762 tgl 519 GIC 145 : if (s == NULL)
762 tgl 520 UIC 0 : return v->err;
7368 tgl 521 GIC 145 : d = newdfa(v, cnfa, cm, &v->dfa2);
762 522 145 : if (d == NULL)
523 : {
7368 tgl 524 UIC 0 : freedfa(s);
525 0 : return v->err;
526 : }
7368 tgl 527 ECB :
7368 tgl 528 CBC 145 : ret = cfindloop(v, cnfa, cm, d, s, &cold);
7368 tgl 529 EUB :
7368 tgl 530 CBC 145 : freedfa(d);
531 145 : freedfa(s);
7368 tgl 532 GIC 145 : NOERR();
7188 bruce 533 GBC 145 : if (v->g->cflags & REG_EXPECT)
7188 bruce 534 EUB : {
7368 tgl 535 UIC 0 : assert(v->details != NULL);
536 0 : if (cold != NULL)
7368 tgl 537 LBC 0 : v->details->rm_extend.rm_so = OFF(cold);
538 : else
539 0 : v->details->rm_extend.rm_so = OFF(v->stop);
2118 540 0 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
7368 tgl 541 ECB : }
7368 tgl 542 CBC 145 : return ret;
543 : }
7368 tgl 544 EUB :
545 : /*
546 : * cfindloop - the heart of cfind
547 : */
548 : static int
2118 tgl 549 GBC 145 : cfindloop(struct vars *v,
550 : struct cnfa *cnfa,
2118 tgl 551 ECB : struct colormap *cm,
552 : struct dfa *d,
553 : struct dfa *s,
554 : chr **coldp) /* where to put coldstart pointer */
555 : {
556 : chr *begin;
557 : chr *end;
7188 bruce 558 : chr *cold;
559 : chr *open; /* open and close of range of possible starts */
560 : chr *close;
561 : chr *estart;
562 : chr *estop;
563 : int er;
7188 bruce 564 GIC 145 : int shorter = v->g->tree->flags & SHORTER;
565 : int hitend;
566 :
7368 tgl 567 145 : assert(d != NULL && s != NULL);
568 145 : cold = NULL;
6482 bruce 569 145 : close = v->search_start;
570 : do
571 : {
572 : /* Search with the search RE for match range at/beyond "close" */
7368 tgl 573 ECB : MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
7188 bruce 574 GIC 161 : close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
2746 tgl 575 161 : if (ISERR())
2746 tgl 576 ECB : {
2746 tgl 577 LBC 0 : *coldp = cold;
578 0 : return v->err;
579 : }
7368 tgl 580 GIC 161 : if (close == NULL)
2746 581 15 : break; /* no more possible match anywhere */
7368 582 146 : assert(cold != NULL);
7368 tgl 583 CBC 146 : open = cold;
584 146 : cold = NULL;
585 : /* Search for matches starting between "open" and "close" inclusive */
7368 tgl 586 EUB : MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
7188 bruce 587 GBC 513 : for (begin = open; begin <= close; begin++)
588 : {
7368 tgl 589 ECB : MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
7368 tgl 590 CBC 452 : estart = begin;
591 452 : estop = v->stop;
7188 bruce 592 ECB : for (;;)
593 : {
594 : /* Here we use the top node's detailed RE */
7368 tgl 595 GIC 646 : if (shorter)
7368 tgl 596 CBC 97 : end = shortest(v, d, begin, estart,
597 : estop, (chr **) NULL, &hitend);
598 : else
599 549 : end = longest(v, d, begin, estop,
7188 bruce 600 ECB : &hitend);
2746 tgl 601 GIC 646 : if (ISERR())
602 : {
2746 tgl 603 UIC 0 : *coldp = cold;
2746 tgl 604 LBC 0 : return v->err;
2746 tgl 605 ECB : }
7368 tgl 606 GIC 646 : if (hitend && cold == NULL)
607 104 : cold = begin;
7368 tgl 608 CBC 646 : if (end == NULL)
2746 tgl 609 GIC 349 : break; /* no match with this begin point, try next */
7368 tgl 610 ECB : MDEBUG(("tentative end %ld\n", LOFF(end)));
611 : /* Dissect the potential match to see if it really matches */
7368 tgl 612 GBC 297 : er = cdissect(v, v->g->tree, begin, end);
7188 bruce 613 297 : if (er == REG_OKAY)
614 : {
7188 bruce 615 CBC 85 : if (v->nmatch > 0)
7188 bruce 616 ECB : {
7368 tgl 617 CBC 85 : v->pmatch[0].rm_so = OFF(begin);
618 85 : v->pmatch[0].rm_eo = OFF(end);
619 : }
7368 tgl 620 GIC 85 : *coldp = cold;
7368 tgl 621 CBC 85 : return REG_OKAY;
7368 tgl 622 ECB : }
7188 bruce 623 GIC 212 : if (er != REG_NOMATCH)
7188 bruce 624 ECB : {
7368 tgl 625 UIC 0 : ERR(er);
6406 tgl 626 LBC 0 : *coldp = cold;
7368 627 0 : return er;
628 : }
2746 tgl 629 ECB : /* Try next longer/shorter match with same begin point */
7368 tgl 630 CBC 212 : if (shorter)
631 : {
3552 632 81 : if (end == estop)
2746 tgl 633 GIC 12 : break; /* no more, so try next begin point */
7368 tgl 634 GBC 69 : estart = end + 1;
3552 tgl 635 EUB : }
7368 636 : else
637 : {
3552 tgl 638 GIC 131 : if (end == begin)
2746 tgl 639 CBC 6 : break; /* no more, so try next begin point */
7368 tgl 640 GIC 125 : estop = end - 1;
3552 tgl 641 ECB : }
642 : } /* end loop over endpoint positions */
643 : } /* end loop over beginning positions */
644 :
645 : /*
646 : * If we get here, there is no possible match starting at or before
2746 647 : * "close", so consider matches beyond that. We'll do a fresh search
648 : * with the search RE to find a new promising match range.
649 : */
2746 tgl 650 GIC 61 : close++;
7368 651 61 : } while (close < v->stop);
652 :
653 60 : *coldp = cold;
654 60 : return REG_NOMATCH;
655 : }
656 :
657 : /*
658 : * zapallsubs - initialize all subexpression matches to "no match"
594 tgl 659 ECB : *
660 : * Note that p[0], the overall-match location, is not touched.
661 : */
7368 662 : static void
4062 tgl 663 CBC 1004872 : zapallsubs(regmatch_t *p,
664 : size_t n)
665 : {
666 : size_t i;
667 :
7188 bruce 668 GIC 1012596 : for (i = n - 1; i > 0; i--)
669 : {
7368 tgl 670 7724 : p[i].rm_so = -1;
671 7724 : p[i].rm_eo = -1;
7368 tgl 672 ECB : }
7368 tgl 673 GIC 1004872 : }
674 :
675 : /*
676 : * zaptreesubs - initialize subexpressions within subtree to "no match"
7368 tgl 677 ECB : */
678 : static void
2118 tgl 679 CBC 520 : zaptreesubs(struct vars *v,
2118 tgl 680 ECB : struct subre *t)
681 : {
778 tgl 682 CBC 520 : int n = t->capno;
683 : struct subre *t2;
684 :
778 tgl 685 GIC 520 : if (n > 0)
686 : {
3972 687 243 : if ((size_t) n < v->nmatch)
3972 tgl 688 ECB : {
3972 tgl 689 GIC 243 : v->pmatch[n].rm_so = -1;
690 243 : v->pmatch[n].rm_eo = -1;
3972 tgl 691 ECB : }
692 : }
693 :
778 tgl 694 CBC 695 : for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
778 tgl 695 GIC 175 : zaptreesubs(v, t2);
7368 tgl 696 CBC 520 : }
697 :
7368 tgl 698 ECB : /*
3972 699 : * subset - set subexpression match data for a successful subre
700 : */
701 : static void
2118 tgl 702 GIC 4059 : subset(struct vars *v,
2118 tgl 703 ECB : struct subre *sub,
7184 bruce 704 : chr *begin,
705 : chr *end)
706 : {
778 tgl 707 GIC 4059 : int n = sub->capno;
708 :
7368 709 4059 : assert(n > 0);
7188 bruce 710 4059 : if ((size_t) n >= v->nmatch)
7368 tgl 711 CBC 9 : return;
712 :
713 : MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, n, LOFF(begin), LOFF(end)));
7368 tgl 714 GIC 4050 : v->pmatch[n].rm_so = OFF(begin);
715 4050 : v->pmatch[n].rm_eo = OFF(end);
7368 tgl 716 ECB : }
717 :
718 : /*
4062 719 : * cdissect - check backrefs and determine subexpression matches
720 : *
721 : * cdissect recursively processes a subre tree to check matching of backrefs
722 : * and/or identify submatch boundaries for capture nodes. The proposed match
723 : * runs from "begin" to "end" (not including "end"), and we are basically
724 : * "dissecting" it to see where the submatches are.
725 : *
726 : * Before calling any level of cdissect, the caller must have run the node's
727 : * DFA and found that the proposed substring satisfies the DFA. (We make
728 : * the caller do that because in concatenation and iteration nodes, it's
729 : * much faster to check all the substrings against the child DFAs before we
730 : * recurse.)
731 : *
732 : * A side-effect of a successful match is to save match locations for
733 : * capturing subexpressions in v->pmatch[]. This is a little bit tricky,
734 : * so we make the following rules:
735 : * 1. Before initial entry to cdissect, all match data must have been
736 : * cleared (this is seen to by zapallsubs).
737 : * 2. Before any recursive entry to cdissect, the match data for that
738 : * subexpression tree must be guaranteed clear (see zaptreesubs).
739 : * 3. When returning REG_OKAY, each level of cdissect will have saved
740 : * any relevant match locations.
741 : * 4. When returning REG_NOMATCH, each level of cdissect will guarantee
742 : * that its subexpression match locations are again clear.
743 : * 5. No guarantees are made for error cases (i.e., other result codes).
744 : * 6. When a level of cdissect abandons a successful sub-match, it will
745 : * clear that subtree's match locations with zaptreesubs before trying
746 : * any new DFA match or cdissect call for that subtree or any subtree
747 : * to its right (that is, any subtree that could have a backref into the
748 : * abandoned match).
749 : * This may seem overly complicated, but it's difficult to simplify it
750 : * because of the provision that match locations must be reset before
751 : * any fresh DFA match (a rule that is needed to make dfa_backref safe).
752 : * That means it won't work to just reset relevant match locations at the
753 : * start of each cdissect level.
754 : */
755 : static int /* regexec return code */
2118 tgl 756 GIC 15248 : cdissect(struct vars *v,
757 : struct subre *t,
758 : chr *begin, /* beginning of relevant substring */
759 : chr *end) /* end of same */
760 : {
761 : int er;
762 :
7368 763 15248 : assert(t != NULL);
764 : MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));
7368 tgl 765 ECB :
766 : /* handy place to check for operation cancel */
1 tmunro 767 GNC 15248 : INTERRUPT(v->re);
768 : /* ... and stack overrun */
2746 tgl 769 GIC 15248 : if (STACK_TOO_DEEP(v->re))
2746 tgl 770 UIC 0 : return REG_ETOOBIG;
3326 tgl 771 ECB :
7188 bruce 772 GIC 15248 : switch (t->op)
773 : {
774 8407 : case '=': /* terminal node */
778 tgl 775 CBC 8407 : assert(t->child == NULL);
4062 tgl 776 GIC 8407 : er = REG_OKAY; /* no action, parent did the work */
4062 tgl 777 CBC 8407 : break;
4066 tgl 778 GBC 209 : case 'b': /* back reference */
778 tgl 779 GIC 209 : assert(t->child == NULL);
4062 tgl 780 CBC 209 : er = cbrdissect(v, t, begin, end);
4062 tgl 781 GIC 209 : break;
7188 bruce 782 CBC 6422 : case '.': /* concatenation */
778 tgl 783 6422 : assert(t->child != NULL);
784 6422 : if (t->child->flags & SHORTER) /* reverse scan */
4062 785 164 : er = crevcondissect(v, t, begin, end);
4062 tgl 786 ECB : else
4062 tgl 787 CBC 6258 : er = ccondissect(v, t, begin, end);
788 6422 : break;
789 68 : case '|': /* alternation */
778 790 68 : assert(t->child != NULL);
4062 791 68 : er = caltdissect(v, t, begin, end);
792 68 : break;
793 112 : case '*': /* iteration */
778 tgl 794 GIC 112 : assert(t->child != NULL);
778 tgl 795 CBC 112 : if (t->child->flags & SHORTER) /* reverse scan */
4062 796 7 : er = creviterdissect(v, t, begin, end);
4062 tgl 797 ECB : else
4062 tgl 798 CBC 105 : er = citerdissect(v, t, begin, end);
799 112 : break;
778 800 30 : case '(': /* no-op capture node */
801 30 : assert(t->child != NULL);
802 30 : er = cdissect(v, t->child, begin, end);
4062 803 30 : break;
7188 bruce 804 LBC 0 : default:
4062 tgl 805 UIC 0 : er = REG_ASSERT;
4062 tgl 806 LBC 0 : break;
7368 tgl 807 ECB : }
4062 808 :
809 : /*
810 : * We should never have a match failure unless backrefs lurk below;
811 : * otherwise, either caller failed to check the DFA, or there's some
4062 tgl 812 EUB : * inconsistency between the DFA and the node's innards.
813 : */
4062 tgl 814 GBC 15248 : assert(er != REG_NOMATCH || (t->flags & BACKR));
815 :
816 : /*
817 : * If this node is marked as capturing, save successful match's location.
818 : */
778 tgl 819 GIC 15248 : if (t->capno > 0 && er == REG_OKAY)
820 4059 : subset(v, t, begin, end);
821 :
4062 tgl 822 CBC 15248 : return er;
823 : }
824 :
825 : /*
826 : * ccondissect - dissect match for concatenation node
7368 tgl 827 ECB : */
7188 bruce 828 : static int /* regexec return code */
2118 tgl 829 GIC 6258 : ccondissect(struct vars *v,
2118 tgl 830 ECB : struct subre *t,
831 : chr *begin, /* beginning of relevant substring */
832 : chr *end) /* end of same */
833 : {
778 tgl 834 GIC 6258 : struct subre *left = t->child;
835 6258 : struct subre *right = left->sibling;
836 : struct dfa *d;
7368 tgl 837 ECB : struct dfa *d2;
838 : chr *mid;
839 : int er;
840 :
7368 tgl 841 GIC 6258 : assert(t->op == '.');
778 tgl 842 CBC 6258 : assert(left != NULL && left->cnfa.nstates > 0);
843 6258 : assert(right != NULL && right->cnfa.nstates > 0);
778 tgl 844 GIC 6258 : assert(right->sibling == NULL);
845 6258 : assert(!(left->flags & SHORTER));
846 :
847 6258 : d = getsubdfa(v, left);
4062 848 6258 : NOERR();
778 tgl 849 CBC 6258 : d2 = getsubdfa(v, right);
4062 850 6258 : NOERR();
781 tgl 851 ECB : MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
7368 852 :
853 : /* pick a tentative midpoint */
4062 tgl 854 GIC 6258 : mid = longest(v, d, begin, end, (int *) NULL);
2746 tgl 855 CBC 6258 : NOERR();
4062 856 6258 : if (mid == NULL)
857 8 : return REG_NOMATCH;
781 tgl 858 ECB : MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
859 :
860 : /* iterate until satisfaction or failure */
861 : for (;;)
7188 bruce 862 : {
7368 tgl 863 : /* try this midpoint on for size */
4815 tgl 864 CBC 7772 : if (longest(v, d2, mid, end, (int *) NULL) == end)
7188 bruce 865 ECB : {
778 tgl 866 GIC 6209 : er = cdissect(v, left, begin, mid);
4815 867 6209 : if (er == REG_OKAY)
868 : {
778 869 6159 : er = cdissect(v, right, mid, end);
4815 870 6159 : if (er == REG_OKAY)
871 : {
4815 tgl 872 ECB : /* satisfaction */
873 : MDEBUG(("%d: successful\n", t->id));
4815 tgl 874 CBC 5911 : return REG_OKAY;
4815 tgl 875 ECB : }
876 : /* Reset left's matches (right should have done so itself) */
594 tgl 877 CBC 248 : zaptreesubs(v, left);
4815 tgl 878 ECB : }
4062 tgl 879 GIC 298 : if (er != REG_NOMATCH)
4815 tgl 880 UIC 0 : return er;
881 : }
2746 tgl 882 CBC 1861 : NOERR();
883 :
884 : /* that midpoint didn't work, find a new one */
7188 bruce 885 1861 : if (mid == begin)
886 : {
7368 tgl 887 ECB : /* all possibilities exhausted */
781 tgl 888 EUB : MDEBUG(("%d: no midpoint\n", t->id));
7368 tgl 889 GIC 75 : return REG_NOMATCH;
7368 tgl 890 ECB : }
7188 bruce 891 GIC 1786 : mid = longest(v, d, begin, mid - 1, (int *) NULL);
2746 tgl 892 1786 : NOERR();
7188 bruce 893 CBC 1786 : if (mid == NULL)
894 : {
895 : /* failed to find a new one */
896 : MDEBUG(("%d: failed midpoint\n", t->id));
7368 tgl 897 264 : return REG_NOMATCH;
898 : }
4062 tgl 899 ECB : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
7368 900 : }
901 :
902 : /* can't get here */
903 : return REG_ASSERT;
904 : }
905 :
906 : /*
907 : * crevcondissect - dissect match for concatenation node, shortest-first
908 : */
909 : static int /* regexec return code */
2118 tgl 910 GIC 164 : crevcondissect(struct vars *v,
911 : struct subre *t,
912 : chr *begin, /* beginning of relevant substring */
913 : chr *end) /* end of same */
914 : {
778 915 164 : struct subre *left = t->child;
916 164 : struct subre *right = left->sibling;
917 : struct dfa *d;
7368 tgl 918 ECB : struct dfa *d2;
919 : chr *mid;
920 : int er;
921 :
7368 tgl 922 GIC 164 : assert(t->op == '.');
778 tgl 923 CBC 164 : assert(left != NULL && left->cnfa.nstates > 0);
924 164 : assert(right != NULL && right->cnfa.nstates > 0);
778 tgl 925 GIC 164 : assert(right->sibling == NULL);
926 164 : assert(left->flags & SHORTER);
927 :
928 164 : d = getsubdfa(v, left);
4062 929 164 : NOERR();
778 tgl 930 CBC 164 : d2 = getsubdfa(v, right);
4062 931 164 : NOERR();
781 tgl 932 ECB : MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
7368 933 :
934 : /* pick a tentative midpoint */
4062 tgl 935 GIC 164 : mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
2746 tgl 936 CBC 164 : NOERR();
4062 937 164 : if (mid == NULL)
4062 tgl 938 LBC 0 : return REG_NOMATCH;
781 tgl 939 ECB : MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
940 :
941 : /* iterate until satisfaction or failure */
942 : for (;;)
7188 bruce 943 : {
7368 tgl 944 : /* try this midpoint on for size */
4815 tgl 945 CBC 605 : if (longest(v, d2, mid, end, (int *) NULL) == end)
7188 bruce 946 EUB : {
778 tgl 947 GIC 93 : er = cdissect(v, left, begin, mid);
4815 948 93 : if (er == REG_OKAY)
949 : {
778 950 93 : er = cdissect(v, right, mid, end);
4815 951 93 : if (er == REG_OKAY)
952 : {
4815 tgl 953 ECB : /* satisfaction */
954 : MDEBUG(("%d: successful\n", t->id));
4815 tgl 955 CBC 81 : return REG_OKAY;
4815 tgl 956 ECB : }
957 : /* Reset left's matches (right should have done so itself) */
594 tgl 958 CBC 12 : zaptreesubs(v, left);
4815 tgl 959 ECB : }
4062 tgl 960 GIC 12 : if (er != REG_NOMATCH)
4815 tgl 961 UIC 0 : return er;
962 : }
2746 tgl 963 CBC 524 : NOERR();
964 :
965 : /* that midpoint didn't work, find a new one */
7188 bruce 966 524 : if (mid == end)
967 : {
7368 tgl 968 ECB : /* all possibilities exhausted */
781 tgl 969 EUB : MDEBUG(("%d: no midpoint\n", t->id));
7368 tgl 970 GIC 83 : return REG_NOMATCH;
7368 tgl 971 ECB : }
7188 bruce 972 GIC 441 : mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
2746 tgl 973 441 : NOERR();
7188 bruce 974 CBC 441 : if (mid == NULL)
975 : {
976 : /* failed to find a new one */
977 : MDEBUG(("%d: failed midpoint\n", t->id));
7368 tgl 978 LBC 0 : return REG_NOMATCH;
979 : }
4062 tgl 980 ECB : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
7368 981 : }
982 :
983 : /* can't get here */
984 : return REG_ASSERT;
985 : }
7368 tgl 986 EUB :
987 : /*
988 : * cbrdissect - dissect match for backref node
989 : *
990 : * The backref match might already have been verified by dfa_backref(),
991 : * but we don't know that for sure so must check it here.
992 : */
993 : static int /* regexec return code */
2118 tgl 994 GIC 209 : cbrdissect(struct vars *v,
995 : struct subre *t,
996 : chr *begin, /* beginning of relevant substring */
997 : chr *end) /* end of same */
998 : {
778 999 209 : int n = t->backno;
1000 : size_t numreps;
1001 : size_t tlen;
4066 tgl 1002 ECB : size_t brlen;
1003 : chr *brstring;
1004 : chr *p;
7188 bruce 1005 GIC 209 : int min = t->min;
1006 209 : int max = t->max;
7368 tgl 1007 ECB :
7368 tgl 1008 GIC 209 : assert(t != NULL);
1009 209 : assert(t->op == 'b');
1010 209 : assert(n >= 0);
7188 bruce 1011 209 : assert((size_t) n < v->nmatch);
1012 :
781 tgl 1013 ECB : MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, n, min, max,
1014 : LOFF(begin), LOFF(end)));
1015 :
4066 1016 : /* get the backreferenced string */
7368 tgl 1017 CBC 209 : if (v->pmatch[n].rm_so == -1)
1018 55 : return REG_NOMATCH;
4066 1019 154 : brstring = v->start + v->pmatch[n].rm_so;
4066 tgl 1020 GIC 154 : brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
1021 :
1022 : /* special cases for zero-length strings */
1023 154 : if (brlen == 0)
1024 : {
4066 tgl 1025 ECB : /*
1026 : * matches only if target is zero length, but any number of
1027 : * repetitions can be considered to be present
1028 : */
4066 tgl 1029 GIC 7 : if (begin == end && min <= max)
1030 : {
781 tgl 1031 ECB : MDEBUG(("%d: backref matched trivially\n", t->id));
4066 tgl 1032 GIC 7 : return REG_OKAY;
1033 : }
4066 tgl 1034 UIC 0 : return REG_NOMATCH;
1035 : }
4066 tgl 1036 GIC 147 : if (begin == end)
7188 bruce 1037 ECB : {
1038 : /* matches only if zero repetitions are okay */
4066 tgl 1039 GIC 6 : if (min == 0)
4066 tgl 1040 ECB : {
1041 : MDEBUG(("%d: backref matched trivially\n", t->id));
7368 tgl 1042 GBC 5 : return REG_OKAY;
1043 : }
7368 tgl 1044 CBC 1 : return REG_NOMATCH;
1045 : }
1046 :
4066 tgl 1047 ECB : /*
1048 : * check target length to see if it could possibly be an allowed number of
1049 : * repetitions of brstring
1050 : */
4066 tgl 1051 GIC 141 : assert(end > begin);
4066 tgl 1052 CBC 141 : tlen = end - begin;
4066 tgl 1053 GIC 141 : if (tlen % brlen != 0)
1054 5 : return REG_NOMATCH;
1055 136 : numreps = tlen / brlen;
2762 1056 136 : if (numreps < min || (numreps > max && max != DUPINF))
7368 1057 3 : return REG_NOMATCH;
1058 :
4066 tgl 1059 ECB : /* okay, compare the actual string contents */
4066 tgl 1060 CBC 133 : p = begin;
1061 236 : while (numreps-- > 0)
7188 bruce 1062 ECB : {
4066 tgl 1063 CBC 151 : if ((*v->g->compare) (brstring, p, brlen) != 0)
1064 48 : return REG_NOMATCH;
1065 103 : p += brlen;
1066 : }
1067 :
781 tgl 1068 ECB : MDEBUG(("%d: backref matched\n", t->id));
4066 tgl 1069 CBC 85 : return REG_OKAY;
1070 : }
7368 tgl 1071 ECB :
1072 : /*
4062 1073 : * caltdissect - dissect match for alternation node
1074 : */
1075 : static int /* regexec return code */
2118 tgl 1076 GIC 68 : caltdissect(struct vars *v,
2118 tgl 1077 ECB : struct subre *t,
1078 : chr *begin, /* beginning of relevant substring */
1079 : chr *end) /* end of same */
1080 : {
1081 : struct dfa *d;
1082 : int er;
1083 :
778 tgl 1084 CBC 68 : assert(t->op == '|');
1085 :
778 tgl 1086 GIC 68 : t = t->child;
1087 : /* there should be at least 2 alternatives */
1088 68 : assert(t != NULL && t->sibling != NULL);
1089 :
4062 1090 125 : while (t != NULL)
1091 : {
778 tgl 1092 CBC 99 : assert(t->cnfa.nstates > 0);
1093 :
781 tgl 1094 ECB : MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1095 :
778 tgl 1096 CBC 99 : d = getsubdfa(v, t);
4062 tgl 1097 GIC 99 : NOERR();
4062 tgl 1098 CBC 99 : if (longest(v, d, begin, end, (int *) NULL) == end)
1099 : {
781 tgl 1100 ECB : MDEBUG(("%d: caltdissect matched\n", t->id));
778 tgl 1101 GIC 76 : er = cdissect(v, t, begin, end);
4062 1102 76 : if (er != REG_NOMATCH)
1103 42 : return er;
4062 tgl 1104 ECB : }
2746 tgl 1105 CBC 57 : NOERR();
7368 tgl 1106 ECB :
778 tgl 1107 GIC 57 : t = t->sibling;
1108 : }
7368 tgl 1109 ECB :
4062 tgl 1110 CBC 26 : return REG_NOMATCH;
7368 tgl 1111 ECB : }
1112 :
4062 1113 : /*
1114 : * citerdissect - dissect match for iteration node
1115 : */
1116 : static int /* regexec return code */
2118 tgl 1117 GIC 105 : citerdissect(struct vars *v,
2118 tgl 1118 ECB : struct subre *t,
1119 : chr *begin, /* beginning of relevant substring */
1120 : chr *end) /* end of same */
1121 : {
1122 : struct dfa *d;
1123 : chr **endpts;
1124 : chr *limit;
4062 1125 : int min_matches;
1126 : size_t max_matches;
1127 : int nverified;
1128 : int k;
1129 : int i;
1130 : int er;
1131 :
4062 tgl 1132 GIC 105 : assert(t->op == '*');
778 1133 105 : assert(t->child != NULL && t->child->cnfa.nstates > 0);
1134 105 : assert(!(t->child->flags & SHORTER));
4062 1135 105 : assert(begin <= end);
1136 :
1137 : MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1138 :
1139 : /*
2746 tgl 1140 ECB : * For the moment, assume the minimum number of matches is 1. If zero
1141 : * matches are allowed, and the target string is empty, we are allowed to
1142 : * match regardless of the contents of the iter node --- but we would
1143 : * prefer to match once, so that capturing parens get set. (An example of
1144 : * the concern here is a pattern like "()*\1", which historically this
1145 : * code has allowed to succeed.) Therefore, we deal with the zero-matches
1146 : * case at the bottom, after failing to find any other way to match.
1147 : */
4062 tgl 1148 GIC 105 : min_matches = t->min;
1149 105 : if (min_matches <= 0)
1150 69 : min_matches = 1;
1151 :
1152 : /*
1153 : * We need workspace to track the endpoints of each sub-match. Normally
1154 : * we consider only nonzero-length sub-matches, so there can be at most
1155 : * end-begin of them. However, if min is larger than that, we will also
4062 tgl 1156 ECB : * consider zero-length sub-matches in order to find enough matches.
1157 : *
1158 : * For convenience, endpts[0] contains the "begin" pointer and we store
1159 : * sub-match endpoints in endpts[1..max_matches].
1160 : */
4062 tgl 1161 GIC 105 : max_matches = end - begin;
2762 1162 105 : if (max_matches > t->max && t->max != DUPINF)
4062 tgl 1163 UIC 0 : max_matches = t->max;
4062 tgl 1164 GIC 105 : if (max_matches < min_matches)
1165 61 : max_matches = min_matches;
1166 105 : endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1167 105 : if (endpts == NULL)
4062 tgl 1168 UIC 0 : return REG_ESPACE;
4062 tgl 1169 CBC 105 : endpts[0] = begin;
4062 tgl 1170 ECB :
778 tgl 1171 GBC 105 : d = getsubdfa(v, t->child);
4062 tgl 1172 CBC 105 : if (ISERR())
4062 tgl 1173 ECB : {
4062 tgl 1174 LBC 0 : FREE(endpts);
1175 0 : return v->err;
4062 tgl 1176 EUB : }
4062 tgl 1177 ECB :
1178 : /*
1179 : * Our strategy is to first find a set of sub-match endpoints that are
1180 : * valid according to the child node's DFA, and then recursively dissect
1181 : * each sub-match to confirm validity. If any validity check fails,
597 tgl 1182 EUB : * backtrack that sub-match and try again. And, when we next try for a
1183 : * validity check, we need not recheck any successfully verified
1184 : * sub-matches that we didn't move the endpoints of. nverified remembers
1185 : * how many sub-matches are currently known okay.
1186 : */
1187 :
1188 : /* initialize to consider first sub-match */
4062 tgl 1189 GIC 105 : nverified = 0;
1190 105 : k = 1;
1191 105 : limit = end;
1192 :
1193 : /* iterate until satisfaction or failure */
1194 479 : while (k > 0)
1195 : {
1196 : /* try to find an endpoint for the k'th sub-match */
4062 tgl 1197 CBC 395 : endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
2746 1198 395 : if (ISERR())
2746 tgl 1199 ECB : {
2746 tgl 1200 UIC 0 : FREE(endpts);
1201 0 : return v->err;
2746 tgl 1202 ECB : }
4062 tgl 1203 GIC 395 : if (endpts[k] == NULL)
1204 : {
4062 tgl 1205 ECB : /* no match possible, so see if we can shorten previous one */
4062 tgl 1206 CBC 200 : k--;
4062 tgl 1207 GIC 200 : goto backtrack;
4062 tgl 1208 EUB : }
1209 : MDEBUG(("%d: working endpoint %d: %ld\n",
1210 : t->id, k, LOFF(endpts[k])));
4062 tgl 1211 ECB :
1212 : /* k'th sub-match can no longer be considered verified */
4062 tgl 1213 GIC 195 : if (nverified >= k)
4062 tgl 1214 CBC 8 : nverified = k - 1;
4062 tgl 1215 ECB :
4062 tgl 1216 GIC 195 : if (endpts[k] != end)
1217 : {
1218 : /* haven't reached end yet, try another iteration if allowed */
1219 134 : if (k >= max_matches)
1220 : {
4062 tgl 1221 ECB : /* must try to shorten some previous match */
4062 tgl 1222 LBC 0 : k--;
4062 tgl 1223 UIC 0 : goto backtrack;
4062 tgl 1224 ECB : }
1225 :
1226 : /* reject zero-length match unless necessary to achieve min */
4062 tgl 1227 CBC 134 : if (endpts[k] == endpts[k - 1] &&
4062 tgl 1228 UIC 0 : (k >= min_matches || min_matches - k < end - endpts[k]))
1229 0 : goto backtrack;
4062 tgl 1230 EUB :
4062 tgl 1231 GBC 134 : k++;
4062 tgl 1232 GIC 134 : limit = end;
1233 134 : continue;
1234 : }
4062 tgl 1235 ECB :
4062 tgl 1236 EUB : /*
3955 bruce 1237 : * We've identified a way to divide the string into k sub-matches that
1238 : * works so far as the child DFA can tell. If k is an allowed number
3955 bruce 1239 ECB : * of matches, start the slow part: recurse to verify each sub-match.
1240 : * We always have k <= max_matches, needn't check that.
4062 tgl 1241 : */
4062 tgl 1242 GIC 61 : if (k < min_matches)
4062 tgl 1243 UIC 0 : goto backtrack;
1244 :
1245 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1246 :
4062 tgl 1247 GIC 100 : for (i = nverified + 1; i <= k; i++)
1248 : {
1249 : /* zap any match data from a non-last iteration */
778 tgl 1250 CBC 79 : zaptreesubs(v, t->child);
778 tgl 1251 GBC 79 : er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
4062 tgl 1252 GIC 79 : if (er == REG_OKAY)
1253 : {
1254 39 : nverified = i;
4062 tgl 1255 CBC 39 : continue;
1256 : }
4062 tgl 1257 GIC 40 : if (er == REG_NOMATCH)
4062 tgl 1258 CBC 40 : break;
4062 tgl 1259 ECB : /* oops, something failed */
4062 tgl 1260 LBC 0 : FREE(endpts);
4062 tgl 1261 UIC 0 : return er;
4062 tgl 1262 ECB : }
1263 :
4062 tgl 1264 GIC 61 : if (i > k)
4062 tgl 1265 ECB : {
1266 : /* satisfaction */
1267 : MDEBUG(("%d: successful\n", t->id));
4062 tgl 1268 GBC 21 : FREE(endpts);
1269 21 : return REG_OKAY;
1270 : }
1271 :
597 tgl 1272 ECB : /* i'th match failed to verify, so backtrack it */
597 tgl 1273 GIC 40 : k = i;
1274 :
4062 1275 240 : backtrack:
3955 bruce 1276 ECB :
4062 tgl 1277 : /*
1278 : * Must consider shorter versions of the k'th sub-match. However,
1279 : * we'll only ask for a zero-length match if necessary.
1280 : */
4062 tgl 1281 CBC 240 : while (k > 0)
1282 : {
3955 bruce 1283 156 : chr *prev_end = endpts[k - 1];
1284 :
4062 tgl 1285 GIC 156 : if (endpts[k] > prev_end)
1286 : {
1287 156 : limit = endpts[k] - 1;
1288 156 : if (limit > prev_end ||
4062 tgl 1289 LBC 0 : (k < min_matches && min_matches - k >= end - prev_end))
1290 : {
4062 tgl 1291 ECB : /* break out of backtrack loop, continue the outer one */
1292 : break;
1293 : }
1294 : }
1295 : /* can't shorten k'th sub-match any more, consider previous one */
4062 tgl 1296 LBC 0 : k--;
4062 tgl 1297 EUB : }
1298 : }
1299 :
1300 : /* all possibilities exhausted */
4062 tgl 1301 GIC 84 : FREE(endpts);
1302 :
1303 : /*
2746 tgl 1304 EUB : * Now consider the possibility that we can match to a zero-length string
1305 : * by using zero repetitions.
1306 : */
2746 tgl 1307 GIC 84 : if (t->min == 0 && begin == end)
1308 : {
781 tgl 1309 ECB : MDEBUG(("%d: allowing zero matches\n", t->id));
2746 tgl 1310 GIC 56 : return REG_OKAY;
1311 : }
1312 :
1313 : MDEBUG(("%d: failed\n", t->id));
4062 1314 28 : return REG_NOMATCH;
4062 tgl 1315 ECB : }
1316 :
1317 : /*
1318 : * creviterdissect - dissect match for iteration node, shortest-first
1319 : */
1320 : static int /* regexec return code */
2118 tgl 1321 GIC 7 : creviterdissect(struct vars *v,
2118 tgl 1322 ECB : struct subre *t,
1323 : chr *begin, /* beginning of relevant substring */
1324 : chr *end) /* end of same */
1325 : {
1326 : struct dfa *d;
1327 : chr **endpts;
1328 : chr *limit;
4062 1329 : int min_matches;
1330 : size_t max_matches;
1331 : int nverified;
1332 : int k;
1333 : int i;
1334 : int er;
1335 :
4062 tgl 1336 GIC 7 : assert(t->op == '*');
778 1337 7 : assert(t->child != NULL && t->child->cnfa.nstates > 0);
1338 7 : assert(t->child->flags & SHORTER);
4062 1339 7 : assert(begin <= end);
1340 :
1341 : MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
1342 :
1343 : /*
4062 tgl 1344 ECB : * If zero matches are allowed, and target string is empty, just declare
1345 : * victory. OTOH, if target string isn't empty, zero matches can't work
1346 : * so we pretend the min is 1.
1347 : */
4062 tgl 1348 GIC 7 : min_matches = t->min;
1349 7 : if (min_matches <= 0)
1350 : {
1351 7 : if (begin == end)
1352 : {
1353 : MDEBUG(("%d: allowing zero matches\n", t->id));
1354 1 : return REG_OKAY;
1355 : }
4062 tgl 1356 CBC 6 : min_matches = 1;
4062 tgl 1357 ECB : }
1358 :
1359 : /*
1360 : * We need workspace to track the endpoints of each sub-match. Normally
1361 : * we consider only nonzero-length sub-matches, so there can be at most
1362 : * end-begin of them. However, if min is larger than that, we will also
1363 : * consider zero-length sub-matches in order to find enough matches.
1364 : *
1365 : * For convenience, endpts[0] contains the "begin" pointer and we store
1366 : * sub-match endpoints in endpts[1..max_matches].
1367 : */
4062 tgl 1368 GIC 6 : max_matches = end - begin;
2762 1369 6 : if (max_matches > t->max && t->max != DUPINF)
4062 1370 6 : max_matches = t->max;
1371 6 : if (max_matches < min_matches)
4062 tgl 1372 UIC 0 : max_matches = min_matches;
4062 tgl 1373 GIC 6 : endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1374 6 : if (endpts == NULL)
4062 tgl 1375 UIC 0 : return REG_ESPACE;
4062 tgl 1376 CBC 6 : endpts[0] = begin;
4062 tgl 1377 ECB :
778 tgl 1378 CBC 6 : d = getsubdfa(v, t->child);
4062 1379 6 : if (ISERR())
4062 tgl 1380 EUB : {
4062 tgl 1381 LBC 0 : FREE(endpts);
1382 0 : return v->err;
4062 tgl 1383 EUB : }
4062 tgl 1384 ECB :
1385 : /*
1386 : * Our strategy is to first find a set of sub-match endpoints that are
1387 : * valid according to the child node's DFA, and then recursively dissect
1388 : * each sub-match to confirm validity. If any validity check fails,
597 tgl 1389 EUB : * backtrack that sub-match and try again. And, when we next try for a
1390 : * validity check, we need not recheck any successfully verified
1391 : * sub-matches that we didn't move the endpoints of. nverified remembers
1392 : * how many sub-matches are currently known okay.
1393 : */
1394 :
1395 : /* initialize to consider first sub-match */
4062 tgl 1396 GIC 6 : nverified = 0;
1397 6 : k = 1;
1398 6 : limit = begin;
1399 :
1400 : /* iterate until satisfaction or failure */
1401 8 : while (k > 0)
1402 : {
1403 : /* disallow zero-length match unless necessary to achieve min */
4062 tgl 1404 CBC 6 : if (limit == endpts[k - 1] &&
1405 6 : limit != end &&
4062 tgl 1406 LBC 0 : (k >= min_matches || min_matches - k < end - limit))
4062 tgl 1407 GIC 6 : limit++;
1408 :
3120 tgl 1409 ECB : /* if this is the last allowed sub-match, it must reach to the end */
3120 tgl 1410 GIC 6 : if (k >= max_matches)
1411 6 : limit = end;
3120 tgl 1412 ECB :
4062 1413 : /* try to find an endpoint for the k'th sub-match */
4062 tgl 1414 GBC 6 : endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
4062 tgl 1415 ECB : (chr **) NULL, (int *) NULL);
2746 tgl 1416 GIC 6 : if (ISERR())
1417 : {
2746 tgl 1418 LBC 0 : FREE(endpts);
1419 0 : return v->err;
1420 : }
4062 tgl 1421 GIC 6 : if (endpts[k] == NULL)
4062 tgl 1422 ECB : {
1423 : /* no match possible, so see if we can lengthen previous one */
4062 tgl 1424 LBC 0 : k--;
4062 tgl 1425 UIC 0 : goto backtrack;
4062 tgl 1426 EUB : }
1427 : MDEBUG(("%d: working endpoint %d: %ld\n",
1428 : t->id, k, LOFF(endpts[k])));
4062 tgl 1429 ECB :
1430 : /* k'th sub-match can no longer be considered verified */
4062 tgl 1431 GIC 6 : if (nverified >= k)
4062 tgl 1432 UBC 0 : nverified = k - 1;
4062 tgl 1433 EUB :
4062 tgl 1434 GIC 6 : if (endpts[k] != end)
1435 : {
1436 : /* haven't reached end yet, try another iteration if allowed */
4062 tgl 1437 UIC 0 : if (k >= max_matches)
1438 : {
4062 tgl 1439 ECB : /* must try to lengthen some previous match */
4062 tgl 1440 UBC 0 : k--;
4062 tgl 1441 UIC 0 : goto backtrack;
4062 tgl 1442 ECB : }
1443 :
4062 tgl 1444 UIC 0 : k++;
4062 tgl 1445 UBC 0 : limit = endpts[k - 1];
4062 tgl 1446 UIC 0 : continue;
1447 : }
4062 tgl 1448 EUB :
1449 : /*
1450 : * We've identified a way to divide the string into k sub-matches that
1451 : * works so far as the child DFA can tell. If k is an allowed number
3955 bruce 1452 : * of matches, start the slow part: recurse to verify each sub-match.
1453 : * We always have k <= max_matches, needn't check that.
4062 tgl 1454 : */
4062 tgl 1455 GIC 6 : if (k < min_matches)
4062 tgl 1456 UIC 0 : goto backtrack;
1457 :
1458 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1459 :
4062 tgl 1460 GIC 10 : for (i = nverified + 1; i <= k; i++)
1461 : {
1462 : /* zap any match data from a non-last iteration */
778 tgl 1463 CBC 6 : zaptreesubs(v, t->child);
778 tgl 1464 GBC 6 : er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
4062 tgl 1465 GIC 6 : if (er == REG_OKAY)
1466 : {
1467 4 : nverified = i;
4062 tgl 1468 CBC 4 : continue;
1469 : }
4062 tgl 1470 GIC 2 : if (er == REG_NOMATCH)
4062 tgl 1471 CBC 2 : break;
4062 tgl 1472 ECB : /* oops, something failed */
4062 tgl 1473 LBC 0 : FREE(endpts);
4062 tgl 1474 UIC 0 : return er;
4062 tgl 1475 ECB : }
1476 :
4062 tgl 1477 GIC 6 : if (i > k)
4062 tgl 1478 ECB : {
1479 : /* satisfaction */
1480 : MDEBUG(("%d: successful\n", t->id));
4062 tgl 1481 GBC 4 : FREE(endpts);
1482 4 : return REG_OKAY;
1483 : }
1484 :
597 tgl 1485 ECB : /* i'th match failed to verify, so backtrack it */
597 tgl 1486 GIC 2 : k = i;
1487 :
4062 1488 2 : backtrack:
3955 bruce 1489 ECB :
4062 tgl 1490 : /*
1491 : * Must consider longer versions of the k'th sub-match.
1492 : */
4062 tgl 1493 GIC 4 : while (k > 0)
4062 tgl 1494 ECB : {
4062 tgl 1495 GIC 2 : if (endpts[k] < end)
4062 tgl 1496 ECB : {
4062 tgl 1497 UIC 0 : limit = endpts[k] + 1;
1498 : /* break out of backtrack loop, continue the outer one */
1499 0 : break;
1500 : }
4062 tgl 1501 ECB : /* can't lengthen k'th sub-match any more, consider previous one */
4062 tgl 1502 GIC 2 : k--;
4062 tgl 1503 ECB : }
1504 : }
4062 tgl 1505 EUB :
1506 : /* all possibilities exhausted */
781 1507 : MDEBUG(("%d: failed\n", t->id));
4062 tgl 1508 GIC 2 : FREE(endpts);
1509 2 : return REG_NOMATCH;
4062 tgl 1510 ECB : }
1511 :
1512 :
1513 :
1514 : #include "rege_dfa.c"
|