LCOV - differential code coverage report
Current view: top level - src/backend/regex - regprefix.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 86.6 % 82 71 11 71
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 2 2 2
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * regprefix.c
       4                 :  *    Extract a common prefix, if any, from a compiled regex.
       5                 :  *
       6                 :  *
       7                 :  * Portions Copyright (c) 2012-2023, PostgreSQL Global Development Group
       8                 :  * Portions Copyright (c) 1998, 1999 Henry Spencer
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/regex/regprefix.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : 
      16                 : #include "regex/regguts.h"
      17                 : 
      18                 : 
      19                 : /*
      20                 :  * forward declarations
      21                 :  */
      22                 : static int  findprefix(struct cnfa *cnfa, struct colormap *cm,
      23                 :                        chr *string, size_t *slength);
      24                 : 
      25                 : 
      26                 : /*
      27                 :  * pg_regprefix - get common prefix for regular expression
      28                 :  *
      29                 :  * Returns one of:
      30                 :  *  REG_NOMATCH: there is no common prefix of strings matching the regex
      31                 :  *  REG_PREFIX: there is a common prefix of strings matching the regex
      32                 :  *  REG_EXACT: all strings satisfying the regex must match the same string
      33                 :  *  or a REG_XXX error code
      34                 :  *
      35                 :  * In the non-failure cases, *string is set to a palloc'd string containing
      36                 :  * the common prefix or exact value, of length *slength (measured in chrs
      37                 :  * not bytes!).
      38                 :  *
      39                 :  * This function does not analyze all complex cases (such as lookaround
      40                 :  * constraints) exactly.  Therefore it is possible that some strings matching
      41                 :  * the reported prefix or exact-match string do not satisfy the regex.  But
      42                 :  * it should never be the case that a string satisfying the regex does not
      43                 :  * match the reported prefix or exact-match string.
      44                 :  */
      45                 : int
      46 CBC        6449 : pg_regprefix(regex_t *re,
      47                 :              chr **string,
      48                 :              size_t *slength)
      49                 : {
      50                 :     struct guts *g;
      51                 :     struct cnfa *cnfa;
      52                 :     int         st;
      53                 : 
      54                 :     /* sanity checks */
      55            6449 :     if (string == NULL || slength == NULL)
      56 UBC           0 :         return REG_INVARG;
      57 CBC        6449 :     *string = NULL;             /* initialize for failure cases */
      58            6449 :     *slength = 0;
      59            6449 :     if (re == NULL || re->re_magic != REMAGIC)
      60 UBC           0 :         return REG_INVARG;
      61 CBC        6449 :     if (re->re_csize != sizeof(chr))
      62 UBC           0 :         return REG_MIXED;
      63                 : 
      64                 :     /* Initialize locale-dependent support */
      65 CBC        6449 :     pg_set_regex_collation(re->re_collation);
      66                 : 
      67                 :     /* setup */
      68            6449 :     g = (struct guts *) re->re_guts;
      69            6449 :     if (g->info & REG_UIMPOSSIBLE)
      70               1 :         return REG_NOMATCH;
      71                 : 
      72                 :     /*
      73                 :      * This implementation considers only the search NFA for the topmost regex
      74                 :      * tree node.  Therefore, constraints such as backrefs are not fully
      75                 :      * applied, which is allowed per the function's API spec.
      76                 :      */
      77            6448 :     assert(g->tree != NULL);
      78            6448 :     cnfa = &g->tree->cnfa;
      79                 : 
      80                 :     /* matchall NFAs never have a fixed prefix */
      81            6448 :     if (cnfa->flags & MATCHALL)
      82               6 :         return REG_NOMATCH;
      83                 : 
      84                 :     /*
      85                 :      * Since a correct NFA should never contain any exit-free loops, it should
      86                 :      * not be possible for our traversal to return to a previously visited NFA
      87                 :      * state.  Hence we need at most nstates chrs in the output string.
      88                 :      */
      89            6442 :     *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
      90            6442 :     if (*string == NULL)
      91 UBC           0 :         return REG_ESPACE;
      92                 : 
      93                 :     /* do it */
      94 CBC        6442 :     st = findprefix(cnfa, &g->cmap, *string, slength);
      95                 : 
      96            6442 :     assert(*slength <= cnfa->nstates);
      97                 : 
      98                 :     /* clean up */
      99            6442 :     if (st != REG_PREFIX && st != REG_EXACT)
     100                 :     {
     101             357 :         FREE(*string);
     102             357 :         *string = NULL;
     103             357 :         *slength = 0;
     104                 :     }
     105                 : 
     106            6442 :     return st;
     107                 : }
     108                 : 
     109                 : /*
     110                 :  * findprefix - extract common prefix from cNFA
     111                 :  *
     112                 :  * Results are returned into the preallocated chr array string[], with
     113                 :  * *slength (which must be preset to zero) incremented for each chr.
     114                 :  */
     115                 : static int                      /* regprefix return code */
     116            6442 : findprefix(struct cnfa *cnfa,
     117                 :            struct colormap *cm,
     118                 :            chr *string,
     119                 :            size_t *slength)
     120                 : {
     121                 :     int         st;
     122                 :     int         nextst;
     123                 :     color       thiscolor;
     124                 :     chr         c;
     125                 :     struct carc *ca;
     126                 : 
     127                 :     /*
     128                 :      * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
     129                 :      * anchored left.  If we have both BOS and BOL, they must go to the same
     130                 :      * next state.
     131                 :      */
     132            6442 :     st = cnfa->pre;
     133            6442 :     nextst = -1;
     134           12621 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     135                 :     {
     136            6448 :         if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     137                 :         {
     138            6185 :             if (nextst == -1)
     139            6179 :                 nextst = ca->to;
     140               6 :             else if (nextst != ca->to)
     141               6 :                 return REG_NOMATCH;
     142                 :         }
     143                 :         else
     144             263 :             return REG_NOMATCH;
     145                 :     }
     146            6173 :     if (nextst == -1)
     147 UBC           0 :         return REG_NOMATCH;
     148                 : 
     149                 :     /*
     150                 :      * Scan through successive states, stopping as soon as we find one with
     151                 :      * more than one acceptable transition character (either multiple colors
     152                 :      * on out-arcs, or a color with more than one member chr).
     153                 :      *
     154                 :      * We could find a state with multiple out-arcs that are all labeled with
     155                 :      * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
     156                 :      * In that case we add the chr "c" to the output string but then exit the
     157                 :      * loop with nextst == -1.  This leaves a little bit on the table: if the
     158                 :      * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
     159                 :      * to the prefix.  But chasing multiple parallel state chains doesn't seem
     160                 :      * worth the trouble.
     161                 :      */
     162                 :     do
     163                 :     {
     164 CBC       79814 :         st = nextst;
     165           79814 :         nextst = -1;
     166           79814 :         thiscolor = COLORLESS;
     167          153535 :         for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     168                 :         {
     169                 :             /* We can ignore BOS/BOL arcs */
     170           79835 :             if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     171 UBC           0 :                 continue;
     172                 : 
     173                 :             /*
     174                 :              * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
     175                 :              * and LACONs
     176                 :              */
     177 CBC       79835 :             if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
     178           74376 :                 ca->co == RAINBOW || ca->co >= cnfa->ncolors)
     179                 :             {
     180            6105 :                 thiscolor = COLORLESS;
     181            6105 :                 break;
     182                 :             }
     183           73730 :             if (thiscolor == COLORLESS)
     184                 :             {
     185                 :                 /* First plain outarc */
     186           73715 :                 thiscolor = ca->co;
     187           73715 :                 nextst = ca->to;
     188                 :             }
     189              15 :             else if (thiscolor == ca->co)
     190                 :             {
     191                 :                 /* Another plain outarc for same color */
     192               6 :                 nextst = -1;
     193                 :             }
     194                 :             else
     195                 :             {
     196                 :                 /* More than one plain outarc color terminates the search */
     197               9 :                 thiscolor = COLORLESS;
     198               9 :                 break;
     199                 :             }
     200                 :         }
     201                 :         /* Done if we didn't find exactly one color on plain outarcs */
     202           79814 :         if (thiscolor == COLORLESS)
     203            6114 :             break;
     204                 :         /* The color must be a singleton */
     205           73700 :         if (cm->cd[thiscolor].nschrs != 1)
     206              53 :             break;
     207                 :         /* Must not have any high-color-map entries */
     208           73647 :         if (cm->cd[thiscolor].nuchrs != 0)
     209 UBC           0 :             break;
     210                 : 
     211                 :         /*
     212                 :          * Identify the color's sole member chr and add it to the prefix
     213                 :          * string.  In general the colormap data structure doesn't provide a
     214                 :          * way to find color member chrs, except by trying GETCOLOR() on each
     215                 :          * possible chr value, which won't do at all.  However, for the cases
     216                 :          * we care about it should be sufficient to test the "firstchr" value,
     217                 :          * that is the first chr ever added to the color.  There are cases
     218                 :          * where this might no longer be a member of the color (so we do need
     219                 :          * to test), but none of them are likely to arise for a character that
     220                 :          * is a member of a common prefix.  If we do hit such a corner case,
     221                 :          * we just fall out without adding anything to the prefix string.
     222                 :          */
     223 CBC       73647 :         c = cm->cd[thiscolor].firstchr;
     224           73647 :         if (GETCOLOR(cm, c) != thiscolor)
     225 UBC           0 :             break;
     226                 : 
     227 CBC       73647 :         string[(*slength)++] = c;
     228                 : 
     229                 :         /* Advance to next state, but only if we have a unique next state */
     230           73647 :     } while (nextst != -1);
     231                 : 
     232                 :     /*
     233                 :      * If we ended at a state that only has EOS/EOL outarcs leading to the
     234                 :      * "post" state, then we have an exact-match string.  Note this is true
     235                 :      * even if the string is of zero length.
     236                 :      */
     237            6173 :     nextst = -1;
     238           11632 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     239                 :     {
     240            6173 :         if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
     241                 :         {
     242            5459 :             if (nextst == -1)
     243            5459 :                 nextst = ca->to;
     244 UBC           0 :             else if (nextst != ca->to)
     245                 :             {
     246               0 :                 nextst = -1;
     247               0 :                 break;
     248                 :             }
     249                 :         }
     250                 :         else
     251                 :         {
     252 CBC         714 :             nextst = -1;
     253             714 :             break;
     254                 :         }
     255                 :     }
     256            6173 :     if (nextst == cnfa->post)
     257            5459 :         return REG_EXACT;
     258                 : 
     259                 :     /*
     260                 :      * Otherwise, if we were unable to identify any prefix characters, say
     261                 :      * NOMATCH --- the pattern is anchored left, but doesn't specify any
     262                 :      * particular first character.
     263                 :      */
     264             714 :     if (*slength > 0)
     265             626 :         return REG_PREFIX;
     266                 : 
     267              88 :     return REG_NOMATCH;
     268                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a