LCOV - differential code coverage report
Current view: top level - src/backend/regex - regexec.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB
Current: Differential Code Coverage HEAD vs 15 Lines: 86.3 % 553 477 32 39 5 26 265 2 184 45 261
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 16 16 16 16
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*
       2                 :  * re_*exec and friends - match REs
       3                 :  *
       4                 :  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
       5                 :  *
       6                 :  * Development of this software was funded, in part, by Cray Research Inc.,
       7                 :  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
       8                 :  * Corporation, none of whom are responsible for the results.  The author
       9                 :  * thanks all of them.
      10                 :  *
      11                 :  * Redistribution and use in source and binary forms -- with or without
      12                 :  * modification -- are permitted for any purpose, provided that
      13                 :  * redistributions in source form retain this entire copyright notice and
      14                 :  * indicate the origin and nature of any modifications.
      15                 :  *
      16                 :  * I'd appreciate being given credit for this package in the documentation
      17                 :  * of software which uses it, but that is not a requirement.
      18                 :  *
      19                 :  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
      20                 :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      21                 :  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
      22                 :  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      23                 :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      24                 :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
      25                 :  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
      26                 :  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
      27                 :  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
      28                 :  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      29                 :  *
      30                 :  * src/backend/regex/regexec.c
      31                 :  *
      32                 :  */
      33                 : 
      34                 : #include "regex/regguts.h"
      35                 : 
      36                 : 
      37                 : 
      38                 : /* lazy-DFA representation */
      39                 : struct arcp
      40                 : {                               /* "pointer" to an outarc */
      41                 :     struct sset *ss;
      42                 :     color       co;
      43                 : };
      44                 : 
      45                 : struct sset
      46                 : {                               /* state set */
      47                 :     unsigned   *states;         /* pointer to bitvector */
      48                 :     unsigned    hash;           /* hash of bitvector */
      49                 : #define  HASH(bv, nw)    (((nw) == 1) ? *(bv) : hash(bv, nw))
      50                 : #define  HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
      51                 :         memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
      52                 :     int         flags;
      53                 : #define  STARTER     01         /* the initial state set */
      54                 : #define  POSTSTATE   02         /* includes the goal state */
      55                 : #define  LOCKED      04         /* locked in cache */
      56                 : #define  NOPROGRESS  010        /* zero-progress state set */
      57                 :     struct arcp ins;            /* chain of inarcs pointing here */
      58                 :     chr        *lastseen;       /* last entered on arrival here */
      59                 :     struct sset **outs;         /* outarc vector indexed by color */
      60                 :     struct arcp *inchain;       /* chain-pointer vector for outarcs */
      61                 : };
      62                 : 
      63                 : struct dfa
      64                 : {
      65                 :     int         nssets;         /* size of cache */
      66                 :     int         nssused;        /* how many entries occupied yet */
      67                 :     int         nstates;        /* number of states */
      68                 :     int         ncolors;        /* length of outarc and inchain vectors */
      69                 :     int         wordsper;       /* length of state-set bitvectors */
      70                 :     struct sset *ssets;         /* state-set cache */
      71                 :     unsigned   *statesarea;     /* bitvector storage */
      72                 :     unsigned   *work;           /* pointer to work area within statesarea */
      73                 :     struct sset **outsarea;     /* outarc-vector storage */
      74                 :     struct arcp *incarea;       /* inchain storage */
      75                 :     struct cnfa *cnfa;
      76                 :     struct colormap *cm;
      77                 :     chr        *lastpost;       /* location of last cache-flushed success */
      78                 :     chr        *lastnopr;       /* location of last cache-flushed NOPROGRESS */
      79                 :     struct sset *search;        /* replacement-search-pointer memory */
      80                 :     int         backno;         /* if DFA for a backref, subno it refers to */
      81                 :     short       backmin;        /* min repetitions for backref */
      82                 :     short       backmax;        /* max repetitions for backref */
      83                 :     bool        ismalloced;     /* should this struct dfa be freed? */
      84                 :     bool        arraysmalloced; /* should its subsidiary arrays be freed? */
      85                 : };
      86                 : 
      87                 : #define WORK    1               /* number of work bitvectors needed */
      88                 : 
      89                 : /* setup for non-malloc allocation for small cases */
      90                 : #define FEWSTATES   20          /* must be less than UBITS */
      91                 : #define FEWCOLORS   15
      92                 : struct smalldfa
      93                 : {
      94                 :     struct dfa  dfa;            /* must be first */
      95                 :     struct sset ssets[FEWSTATES * 2];
      96                 :     unsigned    statesarea[FEWSTATES * 2 + WORK];
      97                 :     struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
      98                 :     struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
      99                 : };
     100                 : 
     101                 : #define DOMALLOC    ((struct smalldfa *)NULL)   /* force malloc */
     102                 : 
     103                 : 
     104                 : 
     105                 : /* internal variables, bundled for easy passing around */
     106                 : struct vars
     107                 : {
     108                 :     regex_t    *re;
     109                 :     struct guts *g;
     110                 :     int         eflags;         /* copies of arguments */
     111                 :     size_t      nmatch;
     112                 :     regmatch_t *pmatch;
     113                 :     rm_detail_t *details;
     114                 :     chr        *start;          /* start of string */
     115                 :     chr        *search_start;   /* search start of string */
     116                 :     chr        *stop;           /* just past end of string */
     117                 :     int         err;            /* error code if any (0 none) */
     118                 :     struct dfa **subdfas;       /* per-tree-subre DFAs */
     119                 :     struct dfa **ladfas;        /* per-lacon-subre DFAs */
     120                 :     struct sset **lblastcss;    /* per-lacon-subre lookbehind restart data */
     121                 :     chr       **lblastcp;       /* per-lacon-subre lookbehind restart data */
     122                 :     struct smalldfa dfa1;
     123                 :     struct smalldfa dfa2;
     124                 : };
     125                 : 
     126                 : #define VISERR(vv)  ((vv)->err != 0) /* have we seen an error yet? */
     127                 : #define ISERR() VISERR(v)
     128                 : #define VERR(vv,e)  ((vv)->err = ((vv)->err ? (vv)->err : (e)))
     129                 : #define ERR(e)  VERR(v, e)      /* record an error */
     130                 : #define NOERR() {if (ISERR()) return v->err;}    /* if error seen, return it */
     131                 : #define OFF(p)  ((p) - v->start)
     132                 : #define LOFF(p) ((long)OFF(p))
     133                 : 
     134                 : 
     135                 : 
     136                 : /*
     137                 :  * forward declarations
     138                 :  */
     139                 : /* === regexec.c === */
     140                 : static struct dfa *getsubdfa(struct vars *v, struct subre *t);
     141                 : static struct dfa *getladfa(struct vars *v, int n);
     142                 : static int  find(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
     143                 : static int  cfind(struct vars *v, struct cnfa *cnfa, struct colormap *cm);
     144                 : static int  cfindloop(struct vars *v, struct cnfa *cnfa, struct colormap *cm,
     145                 :                       struct dfa *d, struct dfa *s, chr **coldp);
     146                 : static void zapallsubs(regmatch_t *p, size_t n);
     147                 : static void zaptreesubs(struct vars *v, struct subre *t);
     148                 : static void subset(struct vars *v, struct subre *sub, chr *begin, chr *end);
     149                 : static int  cdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     150                 : static int  ccondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     151                 : static int  crevcondissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     152                 : static int  cbrdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     153                 : static int  caltdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     154                 : static int  citerdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     155                 : static int  creviterdissect(struct vars *v, struct subre *t, chr *begin, chr *end);
     156                 : 
     157                 : /* === rege_dfa.c === */
     158                 : static chr *longest(struct vars *v, struct dfa *d,
     159                 :                     chr *start, chr *stop, int *hitstopp);
     160                 : static chr *shortest(struct vars *v, struct dfa *d, chr *start, chr *min,
     161                 :                      chr *max, chr **coldp, int *hitstopp);
     162                 : static int  matchuntil(struct vars *v, struct dfa *d, chr *probe,
     163                 :                        struct sset **lastcss, chr **lastcp);
     164                 : static chr *dfa_backref(struct vars *v, struct dfa *d, chr *start,
     165                 :                         chr *min, chr *max, bool shortest);
     166                 : static chr *lastcold(struct vars *v, struct dfa *d);
     167                 : static struct dfa *newdfa(struct vars *v, struct cnfa *cnfa,
     168                 :                           struct colormap *cm, struct smalldfa *sml);
     169                 : static void freedfa(struct dfa *d);
     170                 : static unsigned hash(unsigned *uv, int n);
     171                 : static struct sset *initialize(struct vars *v, struct dfa *d, chr *start);
     172                 : static struct sset *miss(struct vars *v, struct dfa *d, struct sset *css,
     173                 :                          color co, chr *cp, chr *start);
     174                 : static int  lacon(struct vars *v, struct cnfa *pcnfa, chr *cp, color co);
     175                 : static struct sset *getvacant(struct vars *v, struct dfa *d, chr *cp,
     176                 :                               chr *start);
     177                 : static struct sset *pickss(struct vars *v, struct dfa *d, chr *cp,
     178                 :                            chr *start);
     179                 : 
     180                 : 
     181                 : /*
     182                 :  * pg_regexec - match regular expression
     183                 :  */
     184                 : int
     185 GIC      901896 : pg_regexec(regex_t *re,
     186                 :            const chr *string,
     187                 :            size_t len,
     188                 :            size_t search_start,
     189                 :            rm_detail_t *details,
     190                 :            size_t nmatch,
     191                 :            regmatch_t pmatch[],
     192                 :            int flags)
     193                 : {
     194 ECB             :     struct vars var;
     195 GNC      901896 :     struct vars *v = &var;
     196                 :     int         st;
     197                 :     size_t      n;
     198                 :     size_t      i;
     199                 :     int         backref;
     200                 : 
     201                 : #define  LOCALMAT    20
     202                 :     regmatch_t  mat[LOCALMAT];
     203                 : 
     204 ECB             : #define  LOCALDFAS   40
     205                 :     struct dfa *subdfas[LOCALDFAS];
     206                 : 
     207                 :     /* sanity checks */
     208 GIC      901896 :     if (re == NULL || string == NULL || re->re_magic != REMAGIC)
     209 UIC           0 :         return REG_INVARG;
     210 GIC      901896 :     if (re->re_csize != sizeof(chr))
     211 UIC           0 :         return REG_MIXED;
     212 GIC      901896 :     if (search_start > len)
     213               3 :         return REG_NOMATCH;
     214                 : 
     215                 :     /* Initialize locale-dependent support */
     216          901893 :     pg_set_regex_collation(re->re_collation);
     217 ECB             : 
     218 EUB             :     /* setup */
     219 CBC      901893 :     v->re = re;
     220 GBC      901893 :     v->g = (struct guts *) re->re_guts;
     221 CBC      901893 :     if ((v->g->cflags & REG_EXPECT) && details == NULL)
     222 LBC           0 :         return REG_INVARG;
     223 GIC      901893 :     if (v->g->info & REG_UIMPOSSIBLE)
     224             715 :         return REG_NOMATCH;
     225 CBC      901178 :     backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
     226 GIC      901178 :     v->eflags = flags;
     227          901178 :     if (backref && nmatch <= v->g->nsub)
     228 ECB             :     {
     229                 :         /* need larger work area */
     230 CBC          91 :         v->nmatch = v->g->nsub + 1;
     231 GBC          91 :         if (v->nmatch <= LOCALMAT)
     232 CBC          90 :             v->pmatch = mat;
     233 ECB             :         else
     234 CBC           1 :             v->pmatch = (regmatch_t *) MALLOC(v->nmatch * sizeof(regmatch_t));
     235              91 :         if (v->pmatch == NULL)
     236 LBC           0 :             return REG_ESPACE;
     237 GIC          91 :         zapallsubs(v->pmatch, v->nmatch);
     238                 :     }
     239 ECB             :     else
     240                 :     {
     241                 :         /* we can store results directly in caller's array */
     242 GIC      901087 :         v->pmatch = pmatch;
     243 ECB             :         /* ensure any extra entries in caller's array are filled with -1 */
     244 CBC      901087 :         if (nmatch > 0)
     245 GBC      556472 :             zapallsubs(pmatch, nmatch);
     246 ECB             :         /* then forget about extra entries, to avoid useless work in find() */
     247 GIC      901087 :         if (nmatch > v->g->nsub + 1)
     248             677 :             nmatch = v->g->nsub + 1;
     249          901087 :         v->nmatch = nmatch;
     250                 :     }
     251 CBC      901178 :     v->details = details;
     252 GIC      901178 :     v->start = (chr *) string;
     253 CBC      901178 :     v->search_start = (chr *) string + search_start;
     254          901178 :     v->stop = (chr *) string + len;
     255 GIC      901178 :     v->err = 0;
     256 CBC      901178 :     v->subdfas = NULL;
     257          901178 :     v->ladfas = NULL;
     258          901178 :     v->lblastcss = NULL;
     259 GIC      901178 :     v->lblastcp = NULL;
     260 ECB             :     /* below this point, "goto cleanup" will behave sanely */
     261                 : 
     262 CBC      901178 :     assert(v->g->ntree >= 0);
     263          901178 :     n = (size_t) v->g->ntree;
     264          901178 :     if (n <= LOCALDFAS)
     265          901176 :         v->subdfas = subdfas;
     266 ECB             :     else
     267                 :     {
     268 CBC           2 :         v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
     269 GIC           2 :         if (v->subdfas == NULL)
     270                 :         {
     271 LBC           0 :             st = REG_ESPACE;
     272               0 :             goto cleanup;
     273 ECB             :         }
     274                 :     }
     275 GIC     2719024 :     for (i = 0; i < n; i++)
     276         1817846 :         v->subdfas[i] = NULL;
     277 ECB             : 
     278 CBC      901178 :     assert(v->g->nlacons >= 0);
     279 GIC      901178 :     n = (size_t) v->g->nlacons;
     280 GBC      901178 :     if (n > 0)
     281 EUB             :     {
     282 GIC         630 :         v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
     283             630 :         if (v->ladfas == NULL)
     284 ECB             :         {
     285 LBC           0 :             st = REG_ESPACE;
     286 UIC           0 :             goto cleanup;
     287 ECB             :         }
     288 CBC        1922 :         for (i = 0; i < n; i++)
     289            1292 :             v->ladfas[i] = NULL;
     290 GIC         630 :         v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
     291 CBC         630 :         v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
     292             630 :         if (v->lblastcss == NULL || v->lblastcp == NULL)
     293                 :         {
     294 UBC           0 :             st = REG_ESPACE;
     295               0 :             goto cleanup;
     296                 :         }
     297 CBC        1922 :         for (i = 0; i < n; i++)
     298 ECB             :         {
     299 CBC        1292 :             v->lblastcss[i] = NULL;
     300            1292 :             v->lblastcp[i] = NULL;
     301 ECB             :         }
     302                 :     }
     303 EUB             : 
     304                 :     /* do it */
     305 GIC      901178 :     assert(v->g->tree != NULL);
     306 CBC      901178 :     if (backref)
     307 GIC         145 :         st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
     308 ECB             :     else
     309 CBC      901033 :         st = find(v, &v->g->tree->cnfa, &v->g->cmap);
     310                 : 
     311                 :     /* on success, ensure caller's match vector is filled correctly */
     312 GIC      901178 :     if (st == REG_OKAY && nmatch > 0)
     313                 :     {
     314 CBC      451389 :         if (v->pmatch != pmatch)
     315 ECB             :         {
     316                 :             /* copy portion of match vector over from (larger) work area */
     317 GIC          21 :             assert(nmatch <= v->nmatch);
     318 CBC          21 :             memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
     319                 :         }
     320 GIC      451389 :         if (v->g->cflags & REG_NOSUB)
     321 ECB             :         {
     322                 :             /* don't expose possibly-partial sub-match results to caller */
     323 CBC      448309 :             zapallsubs(pmatch, nmatch);
     324                 :         }
     325                 :     }
     326 ECB             : 
     327                 :     /* clean up */
     328 GIC      452869 : cleanup:
     329 CBC      901178 :     if (v->pmatch != pmatch && v->pmatch != mat)
     330 GIC           1 :         FREE(v->pmatch);
     331          901178 :     if (v->subdfas != NULL)
     332 ECB             :     {
     333 GIC      901178 :         n = (size_t) v->g->ntree;
     334         2719024 :         for (i = 0; i < n; i++)
     335                 :         {
     336         1817846 :             if (v->subdfas[i] != NULL)
     337 CBC       12368 :                 freedfa(v->subdfas[i]);
     338 ECB             :         }
     339 CBC      901178 :         if (v->subdfas != subdfas)
     340               2 :             FREE(v->subdfas);
     341                 :     }
     342          901178 :     if (v->ladfas != NULL)
     343 ECB             :     {
     344 GIC         630 :         n = (size_t) v->g->nlacons;
     345 CBC        1922 :         for (i = 0; i < n; i++)
     346 ECB             :         {
     347 GIC        1292 :             if (v->ladfas[i] != NULL)
     348 CBC          53 :                 freedfa(v->ladfas[i]);
     349 ECB             :         }
     350 GIC         630 :         FREE(v->ladfas);
     351 ECB             :     }
     352 GIC      901178 :     if (v->lblastcss != NULL)
     353 CBC         630 :         FREE(v->lblastcss);
     354          901178 :     if (v->lblastcp != NULL)
     355 GIC         630 :         FREE(v->lblastcp);
     356 ECB             : 
     357                 : #ifdef REG_DEBUG
     358                 :     if (v->eflags & (REG_FTRACE | REG_MTRACE))
     359                 :         fflush(stdout);
     360                 : #endif
     361                 : 
     362 CBC      901178 :     return st;
     363 ECB             : }
     364                 : 
     365                 : /*
     366                 :  * getsubdfa - create or re-fetch the DFA for a tree subre node
     367                 :  *
     368                 :  * We only need to create the DFA once per overall regex execution.
     369                 :  * The DFA will be freed by the cleanup step in pg_regexec().
     370                 :  */
     371                 : static struct dfa *
     372 GIC       13054 : getsubdfa(struct vars *v,
     373                 :           struct subre *t)
     374                 : {
     375           13054 :     struct dfa *d = v->subdfas[t->id];
     376                 : 
     377           13054 :     if (d == NULL)
     378                 :     {
     379           12368 :         d = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
     380           12368 :         if (d == NULL)
     381 LBC           0 :             return NULL;
     382                 :         /* set up additional info if this is a backref node */
     383 GIC       12368 :         if (t->op == 'b')
     384 ECB             :         {
     385 GIC         140 :             d->backno = t->backno;
     386 CBC         140 :             d->backmin = t->min;
     387 GIC         140 :             d->backmax = t->max;
     388 ECB             :         }
     389 CBC       12368 :         v->subdfas[t->id] = d;
     390 EUB             :     }
     391 GIC       13054 :     return d;
     392 ECB             : }
     393                 : 
     394                 : /*
     395                 :  * getladfa - create or re-fetch the DFA for a LACON subre node
     396                 :  *
     397                 :  * Same as above, but for LACONs.
     398                 :  */
     399                 : static struct dfa *
     400 CBC         120 : getladfa(struct vars *v,
     401                 :          int n)
     402                 : {
     403 GIC         120 :     assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
     404                 : 
     405             120 :     if (v->ladfas[n] == NULL)
     406                 :     {
     407              53 :         struct subre *sub = &v->g->lacons[n];
     408                 : 
     409 CBC          53 :         v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
     410                 :         /* a LACON can't contain a backref, so nothing else to do */
     411                 :     }
     412             120 :     return v->ladfas[n];
     413                 : }
     414 ECB             : 
     415                 : /*
     416                 :  * find - find a match for the main NFA (no-complications case)
     417                 :  */
     418                 : static int
     419 GIC      901033 : find(struct vars *v,
     420                 :      struct cnfa *cnfa,
     421 ECB             :      struct colormap *cm)
     422                 : {
     423                 :     struct dfa *s;
     424                 :     struct dfa *d;
     425                 :     chr        *begin;
     426 GIC      901033 :     chr        *end = NULL;
     427                 :     chr        *cold;
     428 ECB             :     chr        *open;           /* open and close of range of possible starts */
     429                 :     chr        *close;
     430                 :     int         hitend;
     431 GIC      901033 :     int         shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
     432                 : 
     433                 :     /* first, a shot with the search RE */
     434          901033 :     s = newdfa(v, &v->g->search, cm, &v->dfa1);
     435 CBC      901033 :     if (s == NULL)
     436 UIC           0 :         return v->err;
     437                 :     MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
     438 GIC      901033 :     cold = NULL;
     439          901033 :     close = shortest(v, s, v->search_start, v->search_start, v->stop,
     440 ECB             :                      &cold, (int *) NULL);
     441 GIC      901033 :     freedfa(s);
     442          901033 :     NOERR();
     443 CBC      901033 :     if (v->g->cflags & REG_EXPECT)
     444 ECB             :     {
     445 GBC          27 :         assert(v->details != NULL);
     446 GIC          27 :         if (cold != NULL)
     447 CBC          27 :             v->details->rm_extend.rm_so = OFF(cold);
     448 ECB             :         else
     449 UIC           0 :             v->details->rm_extend.rm_so = OFF(v->stop);
     450 CBC          27 :         v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
     451 ECB             :     }
     452 CBC      901033 :     if (close == NULL)          /* not found */
     453 GIC      330758 :         return REG_NOMATCH;
     454 CBC      570275 :     if (v->nmatch == 0)          /* found, don't need exact location */
     455          118944 :         return REG_OKAY;
     456 ECB             : 
     457                 :     /* find starting point and match */
     458 GBC      451331 :     assert(cold != NULL);
     459 CBC      451331 :     open = cold;
     460 GIC      451331 :     cold = NULL;
     461 ECB             :     MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
     462 CBC      451331 :     d = newdfa(v, cnfa, cm, &v->dfa1);
     463          451331 :     if (d == NULL)
     464 LBC           0 :         return v->err;
     465 GIC      451360 :     for (begin = open; begin <= close; begin++)
     466                 :     {
     467 ECB             :         MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
     468 CBC      451360 :         if (shorter)
     469              61 :             end = shortest(v, d, begin, begin, v->stop,
     470                 :                            (chr **) NULL, &hitend);
     471 ECB             :         else
     472 CBC      451299 :             end = longest(v, d, begin, v->stop, &hitend);
     473 GBC      451360 :         if (ISERR())
     474 ECB             :         {
     475 UIC           0 :             freedfa(d);
     476               0 :             return v->err;
     477 ECB             :         }
     478 CBC      451360 :         if (hitend && cold == NULL)
     479 GIC        3061 :             cold = begin;
     480          451360 :         if (end != NULL)
     481 CBC      451331 :             break;              /* NOTE BREAK OUT */
     482 ECB             :     }
     483 GIC      451331 :     assert(end != NULL);        /* search RE succeeded so loop should */
     484 GBC      451331 :     freedfa(d);
     485 EUB             : 
     486                 :     /* and pin down details */
     487 CBC      451331 :     assert(v->nmatch > 0);
     488          451331 :     v->pmatch[0].rm_so = OFF(begin);
     489          451331 :     v->pmatch[0].rm_eo = OFF(end);
     490          451331 :     if (v->g->cflags & REG_EXPECT)
     491                 :     {
     492              10 :         if (cold != NULL)
     493               5 :             v->details->rm_extend.rm_so = OFF(cold);
     494                 :         else
     495 GIC           5 :             v->details->rm_extend.rm_so = OFF(v->stop);
     496 CBC          10 :         v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
     497 ECB             :     }
     498 CBC      451331 :     if (v->nmatch == 1)          /* no need for submatches */
     499          449125 :         return REG_OKAY;
     500                 : 
     501 ECB             :     /* find submatches */
     502 CBC        2206 :     return cdissect(v, v->g->tree, begin, end);
     503                 : }
     504 ECB             : 
     505                 : /*
     506                 :  * cfind - find a match for the main NFA (with complications)
     507                 :  */
     508                 : static int
     509 GIC         145 : cfind(struct vars *v,
     510                 :       struct cnfa *cnfa,
     511 ECB             :       struct colormap *cm)
     512                 : {
     513                 :     struct dfa *s;
     514                 :     struct dfa *d;
     515                 :     chr        *cold;
     516                 :     int         ret;
     517                 : 
     518 CBC         145 :     s = newdfa(v, &v->g->search, cm, &v->dfa1);
     519 GIC         145 :     if (s == NULL)
     520 UIC           0 :         return v->err;
     521 GIC         145 :     d = newdfa(v, cnfa, cm, &v->dfa2);
     522             145 :     if (d == NULL)
     523                 :     {
     524 UIC           0 :         freedfa(s);
     525               0 :         return v->err;
     526                 :     }
     527 ECB             : 
     528 CBC         145 :     ret = cfindloop(v, cnfa, cm, d, s, &cold);
     529 EUB             : 
     530 CBC         145 :     freedfa(d);
     531             145 :     freedfa(s);
     532 GIC         145 :     NOERR();
     533 GBC         145 :     if (v->g->cflags & REG_EXPECT)
     534 EUB             :     {
     535 UIC           0 :         assert(v->details != NULL);
     536               0 :         if (cold != NULL)
     537 LBC           0 :             v->details->rm_extend.rm_so = OFF(cold);
     538                 :         else
     539               0 :             v->details->rm_extend.rm_so = OFF(v->stop);
     540               0 :         v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
     541 ECB             :     }
     542 CBC         145 :     return ret;
     543                 : }
     544 EUB             : 
     545                 : /*
     546                 :  * cfindloop - the heart of cfind
     547                 :  */
     548                 : static int
     549 GBC         145 : cfindloop(struct vars *v,
     550                 :           struct cnfa *cnfa,
     551 ECB             :           struct colormap *cm,
     552                 :           struct dfa *d,
     553                 :           struct dfa *s,
     554                 :           chr **coldp)          /* where to put coldstart pointer */
     555                 : {
     556                 :     chr        *begin;
     557                 :     chr        *end;
     558                 :     chr        *cold;
     559                 :     chr        *open;           /* open and close of range of possible starts */
     560                 :     chr        *close;
     561                 :     chr        *estart;
     562                 :     chr        *estop;
     563                 :     int         er;
     564 GIC         145 :     int         shorter = v->g->tree->flags & SHORTER;
     565                 :     int         hitend;
     566                 : 
     567             145 :     assert(d != NULL && s != NULL);
     568             145 :     cold = NULL;
     569             145 :     close = v->search_start;
     570                 :     do
     571                 :     {
     572                 :         /* Search with the search RE for match range at/beyond "close" */
     573 ECB             :         MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
     574 GIC         161 :         close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
     575             161 :         if (ISERR())
     576 ECB             :         {
     577 LBC           0 :             *coldp = cold;
     578               0 :             return v->err;
     579                 :         }
     580 GIC         161 :         if (close == NULL)
     581              15 :             break;              /* no more possible match anywhere */
     582             146 :         assert(cold != NULL);
     583 CBC         146 :         open = cold;
     584             146 :         cold = NULL;
     585                 :         /* Search for matches starting between "open" and "close" inclusive */
     586 EUB             :         MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
     587 GBC         513 :         for (begin = open; begin <= close; begin++)
     588                 :         {
     589 ECB             :             MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
     590 CBC         452 :             estart = begin;
     591             452 :             estop = v->stop;
     592 ECB             :             for (;;)
     593                 :             {
     594                 :                 /* Here we use the top node's detailed RE */
     595 GIC         646 :                 if (shorter)
     596 CBC          97 :                     end = shortest(v, d, begin, estart,
     597                 :                                    estop, (chr **) NULL, &hitend);
     598                 :                 else
     599             549 :                     end = longest(v, d, begin, estop,
     600 ECB             :                                   &hitend);
     601 GIC         646 :                 if (ISERR())
     602                 :                 {
     603 UIC           0 :                     *coldp = cold;
     604 LBC           0 :                     return v->err;
     605 ECB             :                 }
     606 GIC         646 :                 if (hitend && cold == NULL)
     607             104 :                     cold = begin;
     608 CBC         646 :                 if (end == NULL)
     609 GIC         349 :                     break;      /* no match with this begin point, try next */
     610 ECB             :                 MDEBUG(("tentative end %ld\n", LOFF(end)));
     611                 :                 /* Dissect the potential match to see if it really matches */
     612 GBC         297 :                 er = cdissect(v, v->g->tree, begin, end);
     613             297 :                 if (er == REG_OKAY)
     614                 :                 {
     615 CBC          85 :                     if (v->nmatch > 0)
     616 ECB             :                     {
     617 CBC          85 :                         v->pmatch[0].rm_so = OFF(begin);
     618              85 :                         v->pmatch[0].rm_eo = OFF(end);
     619                 :                     }
     620 GIC          85 :                     *coldp = cold;
     621 CBC          85 :                     return REG_OKAY;
     622 ECB             :                 }
     623 GIC         212 :                 if (er != REG_NOMATCH)
     624 ECB             :                 {
     625 UIC           0 :                     ERR(er);
     626 LBC           0 :                     *coldp = cold;
     627               0 :                     return er;
     628                 :                 }
     629 ECB             :                 /* Try next longer/shorter match with same begin point */
     630 CBC         212 :                 if (shorter)
     631                 :                 {
     632              81 :                     if (end == estop)
     633 GIC          12 :                         break;  /* no more, so try next begin point */
     634 GBC          69 :                     estart = end + 1;
     635 EUB             :                 }
     636                 :                 else
     637                 :                 {
     638 GIC         131 :                     if (end == begin)
     639 CBC           6 :                         break;  /* no more, so try next begin point */
     640 GIC         125 :                     estop = end - 1;
     641 ECB             :                 }
     642                 :             }                   /* end loop over endpoint positions */
     643                 :         }                       /* end loop over beginning positions */
     644                 : 
     645                 :         /*
     646                 :          * If we get here, there is no possible match starting at or before
     647                 :          * "close", so consider matches beyond that.  We'll do a fresh search
     648                 :          * with the search RE to find a new promising match range.
     649                 :          */
     650 GIC          61 :         close++;
     651              61 :     } while (close < v->stop);
     652                 : 
     653              60 :     *coldp = cold;
     654              60 :     return REG_NOMATCH;
     655                 : }
     656                 : 
     657                 : /*
     658                 :  * zapallsubs - initialize all subexpression matches to "no match"
     659 ECB             :  *
     660                 :  * Note that p[0], the overall-match location, is not touched.
     661                 :  */
     662                 : static void
     663 CBC     1004872 : zapallsubs(regmatch_t *p,
     664                 :            size_t n)
     665                 : {
     666                 :     size_t      i;
     667                 : 
     668 GIC     1012596 :     for (i = n - 1; i > 0; i--)
     669                 :     {
     670            7724 :         p[i].rm_so = -1;
     671            7724 :         p[i].rm_eo = -1;
     672 ECB             :     }
     673 GIC     1004872 : }
     674                 : 
     675                 : /*
     676                 :  * zaptreesubs - initialize subexpressions within subtree to "no match"
     677 ECB             :  */
     678                 : static void
     679 CBC         520 : zaptreesubs(struct vars *v,
     680 ECB             :             struct subre *t)
     681                 : {
     682 CBC         520 :     int         n = t->capno;
     683                 :     struct subre *t2;
     684                 : 
     685 GIC         520 :     if (n > 0)
     686                 :     {
     687             243 :         if ((size_t) n < v->nmatch)
     688 ECB             :         {
     689 GIC         243 :             v->pmatch[n].rm_so = -1;
     690             243 :             v->pmatch[n].rm_eo = -1;
     691 ECB             :         }
     692                 :     }
     693                 : 
     694 CBC         695 :     for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
     695 GIC         175 :         zaptreesubs(v, t2);
     696 CBC         520 : }
     697                 : 
     698 ECB             : /*
     699                 :  * subset - set subexpression match data for a successful subre
     700                 :  */
     701                 : static void
     702 GIC        4059 : subset(struct vars *v,
     703 ECB             :        struct subre *sub,
     704                 :        chr *begin,
     705                 :        chr *end)
     706                 : {
     707 GIC        4059 :     int         n = sub->capno;
     708                 : 
     709            4059 :     assert(n > 0);
     710            4059 :     if ((size_t) n >= v->nmatch)
     711 CBC           9 :         return;
     712                 : 
     713                 :     MDEBUG(("%d: setting %d = %ld-%ld\n", sub->id, n, LOFF(begin), LOFF(end)));
     714 GIC        4050 :     v->pmatch[n].rm_so = OFF(begin);
     715            4050 :     v->pmatch[n].rm_eo = OFF(end);
     716 ECB             : }
     717                 : 
     718                 : /*
     719                 :  * cdissect - check backrefs and determine subexpression matches
     720                 :  *
     721                 :  * cdissect recursively processes a subre tree to check matching of backrefs
     722                 :  * and/or identify submatch boundaries for capture nodes.  The proposed match
     723                 :  * runs from "begin" to "end" (not including "end"), and we are basically
     724                 :  * "dissecting" it to see where the submatches are.
     725                 :  *
     726                 :  * Before calling any level of cdissect, the caller must have run the node's
     727                 :  * DFA and found that the proposed substring satisfies the DFA.  (We make
     728                 :  * the caller do that because in concatenation and iteration nodes, it's
     729                 :  * much faster to check all the substrings against the child DFAs before we
     730                 :  * recurse.)
     731                 :  *
     732                 :  * A side-effect of a successful match is to save match locations for
     733                 :  * capturing subexpressions in v->pmatch[].  This is a little bit tricky,
     734                 :  * so we make the following rules:
     735                 :  * 1. Before initial entry to cdissect, all match data must have been
     736                 :  *    cleared (this is seen to by zapallsubs).
     737                 :  * 2. Before any recursive entry to cdissect, the match data for that
     738                 :  *    subexpression tree must be guaranteed clear (see zaptreesubs).
     739                 :  * 3. When returning REG_OKAY, each level of cdissect will have saved
     740                 :  *    any relevant match locations.
     741                 :  * 4. When returning REG_NOMATCH, each level of cdissect will guarantee
     742                 :  *    that its subexpression match locations are again clear.
     743                 :  * 5. No guarantees are made for error cases (i.e., other result codes).
     744                 :  * 6. When a level of cdissect abandons a successful sub-match, it will
     745                 :  *    clear that subtree's match locations with zaptreesubs before trying
     746                 :  *    any new DFA match or cdissect call for that subtree or any subtree
     747                 :  *    to its right (that is, any subtree that could have a backref into the
     748                 :  *    abandoned match).
     749                 :  * This may seem overly complicated, but it's difficult to simplify it
     750                 :  * because of the provision that match locations must be reset before
     751                 :  * any fresh DFA match (a rule that is needed to make dfa_backref safe).
     752                 :  * That means it won't work to just reset relevant match locations at the
     753                 :  * start of each cdissect level.
     754                 :  */
     755                 : static int                      /* regexec return code */
     756 GIC       15248 : cdissect(struct vars *v,
     757                 :          struct subre *t,
     758                 :          chr *begin,            /* beginning of relevant substring */
     759                 :          chr *end)              /* end of same */
     760                 : {
     761                 :     int         er;
     762                 : 
     763           15248 :     assert(t != NULL);
     764                 :     MDEBUG(("%d: cdissect %c %ld-%ld\n", t->id, t->op, LOFF(begin), LOFF(end)));
     765 ECB             : 
     766                 :     /* handy place to check for operation cancel */
     767 GNC       15248 :     INTERRUPT(v->re);
     768                 :     /* ... and stack overrun */
     769 GIC       15248 :     if (STACK_TOO_DEEP(v->re))
     770 UIC           0 :         return REG_ETOOBIG;
     771 ECB             : 
     772 GIC       15248 :     switch (t->op)
     773                 :     {
     774            8407 :         case '=':               /* terminal node */
     775 CBC        8407 :             assert(t->child == NULL);
     776 GIC        8407 :             er = REG_OKAY;      /* no action, parent did the work */
     777 CBC        8407 :             break;
     778 GBC         209 :         case 'b':               /* back reference */
     779 GIC         209 :             assert(t->child == NULL);
     780 CBC         209 :             er = cbrdissect(v, t, begin, end);
     781 GIC         209 :             break;
     782 CBC        6422 :         case '.':               /* concatenation */
     783            6422 :             assert(t->child != NULL);
     784            6422 :             if (t->child->flags & SHORTER)    /* reverse scan */
     785             164 :                 er = crevcondissect(v, t, begin, end);
     786 ECB             :             else
     787 CBC        6258 :                 er = ccondissect(v, t, begin, end);
     788            6422 :             break;
     789              68 :         case '|':               /* alternation */
     790              68 :             assert(t->child != NULL);
     791              68 :             er = caltdissect(v, t, begin, end);
     792              68 :             break;
     793             112 :         case '*':               /* iteration */
     794 GIC         112 :             assert(t->child != NULL);
     795 CBC         112 :             if (t->child->flags & SHORTER)    /* reverse scan */
     796               7 :                 er = creviterdissect(v, t, begin, end);
     797 ECB             :             else
     798 CBC         105 :                 er = citerdissect(v, t, begin, end);
     799             112 :             break;
     800              30 :         case '(':               /* no-op capture node */
     801              30 :             assert(t->child != NULL);
     802              30 :             er = cdissect(v, t->child, begin, end);
     803              30 :             break;
     804 LBC           0 :         default:
     805 UIC           0 :             er = REG_ASSERT;
     806 LBC           0 :             break;
     807 ECB             :     }
     808                 : 
     809                 :     /*
     810                 :      * We should never have a match failure unless backrefs lurk below;
     811                 :      * otherwise, either caller failed to check the DFA, or there's some
     812 EUB             :      * inconsistency between the DFA and the node's innards.
     813                 :      */
     814 GBC       15248 :     assert(er != REG_NOMATCH || (t->flags & BACKR));
     815                 : 
     816                 :     /*
     817                 :      * If this node is marked as capturing, save successful match's location.
     818                 :      */
     819 GIC       15248 :     if (t->capno > 0 && er == REG_OKAY)
     820            4059 :         subset(v, t, begin, end);
     821                 : 
     822 CBC       15248 :     return er;
     823                 : }
     824                 : 
     825                 : /*
     826                 :  * ccondissect - dissect match for concatenation node
     827 ECB             :  */
     828                 : static int                      /* regexec return code */
     829 GIC        6258 : ccondissect(struct vars *v,
     830 ECB             :             struct subre *t,
     831                 :             chr *begin,         /* beginning of relevant substring */
     832                 :             chr *end)           /* end of same */
     833                 : {
     834 GIC        6258 :     struct subre *left = t->child;
     835            6258 :     struct subre *right = left->sibling;
     836                 :     struct dfa *d;
     837 ECB             :     struct dfa *d2;
     838                 :     chr        *mid;
     839                 :     int         er;
     840                 : 
     841 GIC        6258 :     assert(t->op == '.');
     842 CBC        6258 :     assert(left != NULL && left->cnfa.nstates > 0);
     843            6258 :     assert(right != NULL && right->cnfa.nstates > 0);
     844 GIC        6258 :     assert(right->sibling == NULL);
     845            6258 :     assert(!(left->flags & SHORTER));
     846                 : 
     847            6258 :     d = getsubdfa(v, left);
     848            6258 :     NOERR();
     849 CBC        6258 :     d2 = getsubdfa(v, right);
     850            6258 :     NOERR();
     851 ECB             :     MDEBUG(("%d: ccondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
     852                 : 
     853                 :     /* pick a tentative midpoint */
     854 GIC        6258 :     mid = longest(v, d, begin, end, (int *) NULL);
     855 CBC        6258 :     NOERR();
     856            6258 :     if (mid == NULL)
     857               8 :         return REG_NOMATCH;
     858 ECB             :     MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
     859                 : 
     860                 :     /* iterate until satisfaction or failure */
     861                 :     for (;;)
     862                 :     {
     863                 :         /* try this midpoint on for size */
     864 CBC        7772 :         if (longest(v, d2, mid, end, (int *) NULL) == end)
     865 ECB             :         {
     866 GIC        6209 :             er = cdissect(v, left, begin, mid);
     867            6209 :             if (er == REG_OKAY)
     868                 :             {
     869            6159 :                 er = cdissect(v, right, mid, end);
     870            6159 :                 if (er == REG_OKAY)
     871                 :                 {
     872 ECB             :                     /* satisfaction */
     873                 :                     MDEBUG(("%d: successful\n", t->id));
     874 CBC        5911 :                     return REG_OKAY;
     875 ECB             :                 }
     876                 :                 /* Reset left's matches (right should have done so itself) */
     877 CBC         248 :                 zaptreesubs(v, left);
     878 ECB             :             }
     879 GIC         298 :             if (er != REG_NOMATCH)
     880 UIC           0 :                 return er;
     881                 :         }
     882 CBC        1861 :         NOERR();
     883                 : 
     884                 :         /* that midpoint didn't work, find a new one */
     885            1861 :         if (mid == begin)
     886                 :         {
     887 ECB             :             /* all possibilities exhausted */
     888 EUB             :             MDEBUG(("%d: no midpoint\n", t->id));
     889 GIC          75 :             return REG_NOMATCH;
     890 ECB             :         }
     891 GIC        1786 :         mid = longest(v, d, begin, mid - 1, (int *) NULL);
     892            1786 :         NOERR();
     893 CBC        1786 :         if (mid == NULL)
     894                 :         {
     895                 :             /* failed to find a new one */
     896                 :             MDEBUG(("%d: failed midpoint\n", t->id));
     897             264 :             return REG_NOMATCH;
     898                 :         }
     899 ECB             :         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
     900                 :     }
     901                 : 
     902                 :     /* can't get here */
     903                 :     return REG_ASSERT;
     904                 : }
     905                 : 
     906                 : /*
     907                 :  * crevcondissect - dissect match for concatenation node, shortest-first
     908                 :  */
     909                 : static int                      /* regexec return code */
     910 GIC         164 : crevcondissect(struct vars *v,
     911                 :                struct subre *t,
     912                 :                chr *begin,      /* beginning of relevant substring */
     913                 :                chr *end)        /* end of same */
     914                 : {
     915             164 :     struct subre *left = t->child;
     916             164 :     struct subre *right = left->sibling;
     917                 :     struct dfa *d;
     918 ECB             :     struct dfa *d2;
     919                 :     chr        *mid;
     920                 :     int         er;
     921                 : 
     922 GIC         164 :     assert(t->op == '.');
     923 CBC         164 :     assert(left != NULL && left->cnfa.nstates > 0);
     924             164 :     assert(right != NULL && right->cnfa.nstates > 0);
     925 GIC         164 :     assert(right->sibling == NULL);
     926             164 :     assert(left->flags & SHORTER);
     927                 : 
     928             164 :     d = getsubdfa(v, left);
     929             164 :     NOERR();
     930 CBC         164 :     d2 = getsubdfa(v, right);
     931             164 :     NOERR();
     932 ECB             :     MDEBUG(("%d: crevcondissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
     933                 : 
     934                 :     /* pick a tentative midpoint */
     935 GIC         164 :     mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
     936 CBC         164 :     NOERR();
     937             164 :     if (mid == NULL)
     938 LBC           0 :         return REG_NOMATCH;
     939 ECB             :     MDEBUG(("%d: tentative midpoint %ld\n", t->id, LOFF(mid)));
     940                 : 
     941                 :     /* iterate until satisfaction or failure */
     942                 :     for (;;)
     943                 :     {
     944                 :         /* try this midpoint on for size */
     945 CBC         605 :         if (longest(v, d2, mid, end, (int *) NULL) == end)
     946 EUB             :         {
     947 GIC          93 :             er = cdissect(v, left, begin, mid);
     948              93 :             if (er == REG_OKAY)
     949                 :             {
     950              93 :                 er = cdissect(v, right, mid, end);
     951              93 :                 if (er == REG_OKAY)
     952                 :                 {
     953 ECB             :                     /* satisfaction */
     954                 :                     MDEBUG(("%d: successful\n", t->id));
     955 CBC          81 :                     return REG_OKAY;
     956 ECB             :                 }
     957                 :                 /* Reset left's matches (right should have done so itself) */
     958 CBC          12 :                 zaptreesubs(v, left);
     959 ECB             :             }
     960 GIC          12 :             if (er != REG_NOMATCH)
     961 UIC           0 :                 return er;
     962                 :         }
     963 CBC         524 :         NOERR();
     964                 : 
     965                 :         /* that midpoint didn't work, find a new one */
     966             524 :         if (mid == end)
     967                 :         {
     968 ECB             :             /* all possibilities exhausted */
     969 EUB             :             MDEBUG(("%d: no midpoint\n", t->id));
     970 GIC          83 :             return REG_NOMATCH;
     971 ECB             :         }
     972 GIC         441 :         mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
     973             441 :         NOERR();
     974 CBC         441 :         if (mid == NULL)
     975                 :         {
     976                 :             /* failed to find a new one */
     977                 :             MDEBUG(("%d: failed midpoint\n", t->id));
     978 LBC           0 :             return REG_NOMATCH;
     979                 :         }
     980 ECB             :         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
     981                 :     }
     982                 : 
     983                 :     /* can't get here */
     984                 :     return REG_ASSERT;
     985                 : }
     986 EUB             : 
     987                 : /*
     988                 :  * cbrdissect - dissect match for backref node
     989                 :  *
     990                 :  * The backref match might already have been verified by dfa_backref(),
     991                 :  * but we don't know that for sure so must check it here.
     992                 :  */
     993                 : static int                      /* regexec return code */
     994 GIC         209 : cbrdissect(struct vars *v,
     995                 :            struct subre *t,
     996                 :            chr *begin,          /* beginning of relevant substring */
     997                 :            chr *end)            /* end of same */
     998                 : {
     999             209 :     int         n = t->backno;
    1000                 :     size_t      numreps;
    1001                 :     size_t      tlen;
    1002 ECB             :     size_t      brlen;
    1003                 :     chr        *brstring;
    1004                 :     chr        *p;
    1005 GIC         209 :     int         min = t->min;
    1006             209 :     int         max = t->max;
    1007 ECB             : 
    1008 GIC         209 :     assert(t != NULL);
    1009             209 :     assert(t->op == 'b');
    1010             209 :     assert(n >= 0);
    1011             209 :     assert((size_t) n < v->nmatch);
    1012                 : 
    1013 ECB             :     MDEBUG(("%d: cbrdissect %d{%d-%d} %ld-%ld\n", t->id, n, min, max,
    1014                 :             LOFF(begin), LOFF(end)));
    1015                 : 
    1016                 :     /* get the backreferenced string */
    1017 CBC         209 :     if (v->pmatch[n].rm_so == -1)
    1018              55 :         return REG_NOMATCH;
    1019             154 :     brstring = v->start + v->pmatch[n].rm_so;
    1020 GIC         154 :     brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
    1021                 : 
    1022                 :     /* special cases for zero-length strings */
    1023             154 :     if (brlen == 0)
    1024                 :     {
    1025 ECB             :         /*
    1026                 :          * matches only if target is zero length, but any number of
    1027                 :          * repetitions can be considered to be present
    1028                 :          */
    1029 GIC           7 :         if (begin == end && min <= max)
    1030                 :         {
    1031 ECB             :             MDEBUG(("%d: backref matched trivially\n", t->id));
    1032 GIC           7 :             return REG_OKAY;
    1033                 :         }
    1034 UIC           0 :         return REG_NOMATCH;
    1035                 :     }
    1036 GIC         147 :     if (begin == end)
    1037 ECB             :     {
    1038                 :         /* matches only if zero repetitions are okay */
    1039 GIC           6 :         if (min == 0)
    1040 ECB             :         {
    1041                 :             MDEBUG(("%d: backref matched trivially\n", t->id));
    1042 GBC           5 :             return REG_OKAY;
    1043                 :         }
    1044 CBC           1 :         return REG_NOMATCH;
    1045                 :     }
    1046                 : 
    1047 ECB             :     /*
    1048                 :      * check target length to see if it could possibly be an allowed number of
    1049                 :      * repetitions of brstring
    1050                 :      */
    1051 GIC         141 :     assert(end > begin);
    1052 CBC         141 :     tlen = end - begin;
    1053 GIC         141 :     if (tlen % brlen != 0)
    1054               5 :         return REG_NOMATCH;
    1055             136 :     numreps = tlen / brlen;
    1056             136 :     if (numreps < min || (numreps > max && max != DUPINF))
    1057               3 :         return REG_NOMATCH;
    1058                 : 
    1059 ECB             :     /* okay, compare the actual string contents */
    1060 CBC         133 :     p = begin;
    1061             236 :     while (numreps-- > 0)
    1062 ECB             :     {
    1063 CBC         151 :         if ((*v->g->compare) (brstring, p, brlen) != 0)
    1064              48 :             return REG_NOMATCH;
    1065             103 :         p += brlen;
    1066                 :     }
    1067                 : 
    1068 ECB             :     MDEBUG(("%d: backref matched\n", t->id));
    1069 CBC          85 :     return REG_OKAY;
    1070                 : }
    1071 ECB             : 
    1072                 : /*
    1073                 :  * caltdissect - dissect match for alternation node
    1074                 :  */
    1075                 : static int                      /* regexec return code */
    1076 GIC          68 : caltdissect(struct vars *v,
    1077 ECB             :             struct subre *t,
    1078                 :             chr *begin,         /* beginning of relevant substring */
    1079                 :             chr *end)           /* end of same */
    1080                 : {
    1081                 :     struct dfa *d;
    1082                 :     int         er;
    1083                 : 
    1084 CBC          68 :     assert(t->op == '|');
    1085                 : 
    1086 GIC          68 :     t = t->child;
    1087                 :     /* there should be at least 2 alternatives */
    1088              68 :     assert(t != NULL && t->sibling != NULL);
    1089                 : 
    1090             125 :     while (t != NULL)
    1091                 :     {
    1092 CBC          99 :         assert(t->cnfa.nstates > 0);
    1093                 : 
    1094 ECB             :         MDEBUG(("%d: caltdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
    1095                 : 
    1096 CBC          99 :         d = getsubdfa(v, t);
    1097 GIC          99 :         NOERR();
    1098 CBC          99 :         if (longest(v, d, begin, end, (int *) NULL) == end)
    1099                 :         {
    1100 ECB             :             MDEBUG(("%d: caltdissect matched\n", t->id));
    1101 GIC          76 :             er = cdissect(v, t, begin, end);
    1102              76 :             if (er != REG_NOMATCH)
    1103              42 :                 return er;
    1104 ECB             :         }
    1105 CBC          57 :         NOERR();
    1106 ECB             : 
    1107 GIC          57 :         t = t->sibling;
    1108                 :     }
    1109 ECB             : 
    1110 CBC          26 :     return REG_NOMATCH;
    1111 ECB             : }
    1112                 : 
    1113                 : /*
    1114                 :  * citerdissect - dissect match for iteration node
    1115                 :  */
    1116                 : static int                      /* regexec return code */
    1117 GIC         105 : citerdissect(struct vars *v,
    1118 ECB             :              struct subre *t,
    1119                 :              chr *begin,        /* beginning of relevant substring */
    1120                 :              chr *end)          /* end of same */
    1121                 : {
    1122                 :     struct dfa *d;
    1123                 :     chr       **endpts;
    1124                 :     chr        *limit;
    1125                 :     int         min_matches;
    1126                 :     size_t      max_matches;
    1127                 :     int         nverified;
    1128                 :     int         k;
    1129                 :     int         i;
    1130                 :     int         er;
    1131                 : 
    1132 GIC         105 :     assert(t->op == '*');
    1133             105 :     assert(t->child != NULL && t->child->cnfa.nstates > 0);
    1134             105 :     assert(!(t->child->flags & SHORTER));
    1135             105 :     assert(begin <= end);
    1136                 : 
    1137                 :     MDEBUG(("%d: citerdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
    1138                 : 
    1139                 :     /*
    1140 ECB             :      * For the moment, assume the minimum number of matches is 1.  If zero
    1141                 :      * matches are allowed, and the target string is empty, we are allowed to
    1142                 :      * match regardless of the contents of the iter node --- but we would
    1143                 :      * prefer to match once, so that capturing parens get set.  (An example of
    1144                 :      * the concern here is a pattern like "()*\1", which historically this
    1145                 :      * code has allowed to succeed.)  Therefore, we deal with the zero-matches
    1146                 :      * case at the bottom, after failing to find any other way to match.
    1147                 :      */
    1148 GIC         105 :     min_matches = t->min;
    1149             105 :     if (min_matches <= 0)
    1150              69 :         min_matches = 1;
    1151                 : 
    1152                 :     /*
    1153                 :      * We need workspace to track the endpoints of each sub-match.  Normally
    1154                 :      * we consider only nonzero-length sub-matches, so there can be at most
    1155                 :      * end-begin of them.  However, if min is larger than that, we will also
    1156 ECB             :      * consider zero-length sub-matches in order to find enough matches.
    1157                 :      *
    1158                 :      * For convenience, endpts[0] contains the "begin" pointer and we store
    1159                 :      * sub-match endpoints in endpts[1..max_matches].
    1160                 :      */
    1161 GIC         105 :     max_matches = end - begin;
    1162             105 :     if (max_matches > t->max && t->max != DUPINF)
    1163 UIC           0 :         max_matches = t->max;
    1164 GIC         105 :     if (max_matches < min_matches)
    1165              61 :         max_matches = min_matches;
    1166             105 :     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
    1167             105 :     if (endpts == NULL)
    1168 UIC           0 :         return REG_ESPACE;
    1169 CBC         105 :     endpts[0] = begin;
    1170 ECB             : 
    1171 GBC         105 :     d = getsubdfa(v, t->child);
    1172 CBC         105 :     if (ISERR())
    1173 ECB             :     {
    1174 LBC           0 :         FREE(endpts);
    1175               0 :         return v->err;
    1176 EUB             :     }
    1177 ECB             : 
    1178                 :     /*
    1179                 :      * Our strategy is to first find a set of sub-match endpoints that are
    1180                 :      * valid according to the child node's DFA, and then recursively dissect
    1181                 :      * each sub-match to confirm validity.  If any validity check fails,
    1182 EUB             :      * backtrack that sub-match and try again.  And, when we next try for a
    1183                 :      * validity check, we need not recheck any successfully verified
    1184                 :      * sub-matches that we didn't move the endpoints of.  nverified remembers
    1185                 :      * how many sub-matches are currently known okay.
    1186                 :      */
    1187                 : 
    1188                 :     /* initialize to consider first sub-match */
    1189 GIC         105 :     nverified = 0;
    1190             105 :     k = 1;
    1191             105 :     limit = end;
    1192                 : 
    1193                 :     /* iterate until satisfaction or failure */
    1194             479 :     while (k > 0)
    1195                 :     {
    1196                 :         /* try to find an endpoint for the k'th sub-match */
    1197 CBC         395 :         endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
    1198             395 :         if (ISERR())
    1199 ECB             :         {
    1200 UIC           0 :             FREE(endpts);
    1201               0 :             return v->err;
    1202 ECB             :         }
    1203 GIC         395 :         if (endpts[k] == NULL)
    1204                 :         {
    1205 ECB             :             /* no match possible, so see if we can shorten previous one */
    1206 CBC         200 :             k--;
    1207 GIC         200 :             goto backtrack;
    1208 EUB             :         }
    1209                 :         MDEBUG(("%d: working endpoint %d: %ld\n",
    1210                 :                 t->id, k, LOFF(endpts[k])));
    1211 ECB             : 
    1212                 :         /* k'th sub-match can no longer be considered verified */
    1213 GIC         195 :         if (nverified >= k)
    1214 CBC           8 :             nverified = k - 1;
    1215 ECB             : 
    1216 GIC         195 :         if (endpts[k] != end)
    1217                 :         {
    1218                 :             /* haven't reached end yet, try another iteration if allowed */
    1219             134 :             if (k >= max_matches)
    1220                 :             {
    1221 ECB             :                 /* must try to shorten some previous match */
    1222 LBC           0 :                 k--;
    1223 UIC           0 :                 goto backtrack;
    1224 ECB             :             }
    1225                 : 
    1226                 :             /* reject zero-length match unless necessary to achieve min */
    1227 CBC         134 :             if (endpts[k] == endpts[k - 1] &&
    1228 UIC           0 :                 (k >= min_matches || min_matches - k < end - endpts[k]))
    1229               0 :                 goto backtrack;
    1230 EUB             : 
    1231 GBC         134 :             k++;
    1232 GIC         134 :             limit = end;
    1233             134 :             continue;
    1234                 :         }
    1235 ECB             : 
    1236 EUB             :         /*
    1237                 :          * We've identified a way to divide the string into k sub-matches that
    1238                 :          * works so far as the child DFA can tell.  If k is an allowed number
    1239 ECB             :          * of matches, start the slow part: recurse to verify each sub-match.
    1240                 :          * We always have k <= max_matches, needn't check that.
    1241                 :          */
    1242 GIC          61 :         if (k < min_matches)
    1243 UIC           0 :             goto backtrack;
    1244                 : 
    1245                 :         MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
    1246                 : 
    1247 GIC         100 :         for (i = nverified + 1; i <= k; i++)
    1248                 :         {
    1249                 :             /* zap any match data from a non-last iteration */
    1250 CBC          79 :             zaptreesubs(v, t->child);
    1251 GBC          79 :             er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
    1252 GIC          79 :             if (er == REG_OKAY)
    1253                 :             {
    1254              39 :                 nverified = i;
    1255 CBC          39 :                 continue;
    1256                 :             }
    1257 GIC          40 :             if (er == REG_NOMATCH)
    1258 CBC          40 :                 break;
    1259 ECB             :             /* oops, something failed */
    1260 LBC           0 :             FREE(endpts);
    1261 UIC           0 :             return er;
    1262 ECB             :         }
    1263                 : 
    1264 GIC          61 :         if (i > k)
    1265 ECB             :         {
    1266                 :             /* satisfaction */
    1267                 :             MDEBUG(("%d: successful\n", t->id));
    1268 GBC          21 :             FREE(endpts);
    1269              21 :             return REG_OKAY;
    1270                 :         }
    1271                 : 
    1272 ECB             :         /* i'th match failed to verify, so backtrack it */
    1273 GIC          40 :         k = i;
    1274                 : 
    1275             240 : backtrack:
    1276 ECB             : 
    1277                 :         /*
    1278                 :          * Must consider shorter versions of the k'th sub-match.  However,
    1279                 :          * we'll only ask for a zero-length match if necessary.
    1280                 :          */
    1281 CBC         240 :         while (k > 0)
    1282                 :         {
    1283             156 :             chr        *prev_end = endpts[k - 1];
    1284                 : 
    1285 GIC         156 :             if (endpts[k] > prev_end)
    1286                 :             {
    1287             156 :                 limit = endpts[k] - 1;
    1288             156 :                 if (limit > prev_end ||
    1289 LBC           0 :                     (k < min_matches && min_matches - k >= end - prev_end))
    1290                 :                 {
    1291 ECB             :                     /* break out of backtrack loop, continue the outer one */
    1292                 :                     break;
    1293                 :                 }
    1294                 :             }
    1295                 :             /* can't shorten k'th sub-match any more, consider previous one */
    1296 LBC           0 :             k--;
    1297 EUB             :         }
    1298                 :     }
    1299                 : 
    1300                 :     /* all possibilities exhausted */
    1301 GIC          84 :     FREE(endpts);
    1302                 : 
    1303                 :     /*
    1304 EUB             :      * Now consider the possibility that we can match to a zero-length string
    1305                 :      * by using zero repetitions.
    1306                 :      */
    1307 GIC          84 :     if (t->min == 0 && begin == end)
    1308                 :     {
    1309 ECB             :         MDEBUG(("%d: allowing zero matches\n", t->id));
    1310 GIC          56 :         return REG_OKAY;
    1311                 :     }
    1312                 : 
    1313                 :     MDEBUG(("%d: failed\n", t->id));
    1314              28 :     return REG_NOMATCH;
    1315 ECB             : }
    1316                 : 
    1317                 : /*
    1318                 :  * creviterdissect - dissect match for iteration node, shortest-first
    1319                 :  */
    1320                 : static int                      /* regexec return code */
    1321 GIC           7 : creviterdissect(struct vars *v,
    1322 ECB             :                 struct subre *t,
    1323                 :                 chr *begin,     /* beginning of relevant substring */
    1324                 :                 chr *end)       /* end of same */
    1325                 : {
    1326                 :     struct dfa *d;
    1327                 :     chr       **endpts;
    1328                 :     chr        *limit;
    1329                 :     int         min_matches;
    1330                 :     size_t      max_matches;
    1331                 :     int         nverified;
    1332                 :     int         k;
    1333                 :     int         i;
    1334                 :     int         er;
    1335                 : 
    1336 GIC           7 :     assert(t->op == '*');
    1337               7 :     assert(t->child != NULL && t->child->cnfa.nstates > 0);
    1338               7 :     assert(t->child->flags & SHORTER);
    1339               7 :     assert(begin <= end);
    1340                 : 
    1341                 :     MDEBUG(("%d: creviterdissect %ld-%ld\n", t->id, LOFF(begin), LOFF(end)));
    1342                 : 
    1343                 :     /*
    1344 ECB             :      * If zero matches are allowed, and target string is empty, just declare
    1345                 :      * victory.  OTOH, if target string isn't empty, zero matches can't work
    1346                 :      * so we pretend the min is 1.
    1347                 :      */
    1348 GIC           7 :     min_matches = t->min;
    1349               7 :     if (min_matches <= 0)
    1350                 :     {
    1351               7 :         if (begin == end)
    1352                 :         {
    1353                 :             MDEBUG(("%d: allowing zero matches\n", t->id));
    1354               1 :             return REG_OKAY;
    1355                 :         }
    1356 CBC           6 :         min_matches = 1;
    1357 ECB             :     }
    1358                 : 
    1359                 :     /*
    1360                 :      * We need workspace to track the endpoints of each sub-match.  Normally
    1361                 :      * we consider only nonzero-length sub-matches, so there can be at most
    1362                 :      * end-begin of them.  However, if min is larger than that, we will also
    1363                 :      * consider zero-length sub-matches in order to find enough matches.
    1364                 :      *
    1365                 :      * For convenience, endpts[0] contains the "begin" pointer and we store
    1366                 :      * sub-match endpoints in endpts[1..max_matches].
    1367                 :      */
    1368 GIC           6 :     max_matches = end - begin;
    1369               6 :     if (max_matches > t->max && t->max != DUPINF)
    1370               6 :         max_matches = t->max;
    1371               6 :     if (max_matches < min_matches)
    1372 UIC           0 :         max_matches = min_matches;
    1373 GIC           6 :     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
    1374               6 :     if (endpts == NULL)
    1375 UIC           0 :         return REG_ESPACE;
    1376 CBC           6 :     endpts[0] = begin;
    1377 ECB             : 
    1378 CBC           6 :     d = getsubdfa(v, t->child);
    1379               6 :     if (ISERR())
    1380 EUB             :     {
    1381 LBC           0 :         FREE(endpts);
    1382               0 :         return v->err;
    1383 EUB             :     }
    1384 ECB             : 
    1385                 :     /*
    1386                 :      * Our strategy is to first find a set of sub-match endpoints that are
    1387                 :      * valid according to the child node's DFA, and then recursively dissect
    1388                 :      * each sub-match to confirm validity.  If any validity check fails,
    1389 EUB             :      * backtrack that sub-match and try again.  And, when we next try for a
    1390                 :      * validity check, we need not recheck any successfully verified
    1391                 :      * sub-matches that we didn't move the endpoints of.  nverified remembers
    1392                 :      * how many sub-matches are currently known okay.
    1393                 :      */
    1394                 : 
    1395                 :     /* initialize to consider first sub-match */
    1396 GIC           6 :     nverified = 0;
    1397               6 :     k = 1;
    1398               6 :     limit = begin;
    1399                 : 
    1400                 :     /* iterate until satisfaction or failure */
    1401               8 :     while (k > 0)
    1402                 :     {
    1403                 :         /* disallow zero-length match unless necessary to achieve min */
    1404 CBC           6 :         if (limit == endpts[k - 1] &&
    1405               6 :             limit != end &&
    1406 LBC           0 :             (k >= min_matches || min_matches - k < end - limit))
    1407 GIC           6 :             limit++;
    1408                 : 
    1409 ECB             :         /* if this is the last allowed sub-match, it must reach to the end */
    1410 GIC           6 :         if (k >= max_matches)
    1411               6 :             limit = end;
    1412 ECB             : 
    1413                 :         /* try to find an endpoint for the k'th sub-match */
    1414 GBC           6 :         endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
    1415 ECB             :                              (chr **) NULL, (int *) NULL);
    1416 GIC           6 :         if (ISERR())
    1417                 :         {
    1418 LBC           0 :             FREE(endpts);
    1419               0 :             return v->err;
    1420                 :         }
    1421 GIC           6 :         if (endpts[k] == NULL)
    1422 ECB             :         {
    1423                 :             /* no match possible, so see if we can lengthen previous one */
    1424 LBC           0 :             k--;
    1425 UIC           0 :             goto backtrack;
    1426 EUB             :         }
    1427                 :         MDEBUG(("%d: working endpoint %d: %ld\n",
    1428                 :                 t->id, k, LOFF(endpts[k])));
    1429 ECB             : 
    1430                 :         /* k'th sub-match can no longer be considered verified */
    1431 GIC           6 :         if (nverified >= k)
    1432 UBC           0 :             nverified = k - 1;
    1433 EUB             : 
    1434 GIC           6 :         if (endpts[k] != end)
    1435                 :         {
    1436                 :             /* haven't reached end yet, try another iteration if allowed */
    1437 UIC           0 :             if (k >= max_matches)
    1438                 :             {
    1439 ECB             :                 /* must try to lengthen some previous match */
    1440 UBC           0 :                 k--;
    1441 UIC           0 :                 goto backtrack;
    1442 ECB             :             }
    1443                 : 
    1444 UIC           0 :             k++;
    1445 UBC           0 :             limit = endpts[k - 1];
    1446 UIC           0 :             continue;
    1447                 :         }
    1448 EUB             : 
    1449                 :         /*
    1450                 :          * We've identified a way to divide the string into k sub-matches that
    1451                 :          * works so far as the child DFA can tell.  If k is an allowed number
    1452                 :          * of matches, start the slow part: recurse to verify each sub-match.
    1453                 :          * We always have k <= max_matches, needn't check that.
    1454                 :          */
    1455 GIC           6 :         if (k < min_matches)
    1456 UIC           0 :             goto backtrack;
    1457                 : 
    1458                 :         MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
    1459                 : 
    1460 GIC          10 :         for (i = nverified + 1; i <= k; i++)
    1461                 :         {
    1462                 :             /* zap any match data from a non-last iteration */
    1463 CBC           6 :             zaptreesubs(v, t->child);
    1464 GBC           6 :             er = cdissect(v, t->child, endpts[i - 1], endpts[i]);
    1465 GIC           6 :             if (er == REG_OKAY)
    1466                 :             {
    1467               4 :                 nverified = i;
    1468 CBC           4 :                 continue;
    1469                 :             }
    1470 GIC           2 :             if (er == REG_NOMATCH)
    1471 CBC           2 :                 break;
    1472 ECB             :             /* oops, something failed */
    1473 LBC           0 :             FREE(endpts);
    1474 UIC           0 :             return er;
    1475 ECB             :         }
    1476                 : 
    1477 GIC           6 :         if (i > k)
    1478 ECB             :         {
    1479                 :             /* satisfaction */
    1480                 :             MDEBUG(("%d: successful\n", t->id));
    1481 GBC           4 :             FREE(endpts);
    1482               4 :             return REG_OKAY;
    1483                 :         }
    1484                 : 
    1485 ECB             :         /* i'th match failed to verify, so backtrack it */
    1486 GIC           2 :         k = i;
    1487                 : 
    1488               2 : backtrack:
    1489 ECB             : 
    1490                 :         /*
    1491                 :          * Must consider longer versions of the k'th sub-match.
    1492                 :          */
    1493 GIC           4 :         while (k > 0)
    1494 ECB             :         {
    1495 GIC           2 :             if (endpts[k] < end)
    1496 ECB             :             {
    1497 UIC           0 :                 limit = endpts[k] + 1;
    1498                 :                 /* break out of backtrack loop, continue the outer one */
    1499               0 :                 break;
    1500                 :             }
    1501 ECB             :             /* can't lengthen k'th sub-match any more, consider previous one */
    1502 GIC           2 :             k--;
    1503 ECB             :         }
    1504                 :     }
    1505 EUB             : 
    1506                 :     /* all possibilities exhausted */
    1507                 :     MDEBUG(("%d: failed\n", t->id));
    1508 GIC           2 :     FREE(endpts);
    1509               2 :     return REG_NOMATCH;
    1510 ECB             : }
    1511                 : 
    1512                 : 
    1513                 : 
    1514                 : #include "rege_dfa.c"
        

Generated by: LCOV version v1.16-55-g56c0a2a