Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * regprefix.c
4 : : * Extract a common prefix, if any, from a compiled regex.
5 : : *
6 : : *
7 : : * Portions Copyright (c) 2012-2024, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1998, 1999 Henry Spencer
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/regex/regprefix.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : :
16 : : #include "regex/regguts.h"
17 : :
18 : :
19 : : /*
20 : : * forward declarations
21 : : */
22 : : static int findprefix(struct cnfa *cnfa, struct colormap *cm,
23 : : chr *string, size_t *slength);
24 : :
25 : :
26 : : /*
27 : : * pg_regprefix - get common prefix for regular expression
28 : : *
29 : : * Returns one of:
30 : : * REG_NOMATCH: there is no common prefix of strings matching the regex
31 : : * REG_PREFIX: there is a common prefix of strings matching the regex
32 : : * REG_EXACT: all strings satisfying the regex must match the same string
33 : : * or a REG_XXX error code
34 : : *
35 : : * In the non-failure cases, *string is set to a palloc'd string containing
36 : : * the common prefix or exact value, of length *slength (measured in chrs
37 : : * not bytes!).
38 : : *
39 : : * This function does not analyze all complex cases (such as lookaround
40 : : * constraints) exactly. Therefore it is possible that some strings matching
41 : : * the reported prefix or exact-match string do not satisfy the regex. But
42 : : * it should never be the case that a string satisfying the regex does not
43 : : * match the reported prefix or exact-match string.
44 : : */
45 : : int
4296 tgl@sss.pgh.pa.us 46 :CBC 7177 : pg_regprefix(regex_t *re,
47 : : chr **string,
48 : : size_t *slength)
49 : : {
50 : : struct guts *g;
51 : : struct cnfa *cnfa;
52 : : int st;
53 : :
54 : : /* sanity checks */
55 [ + - - + ]: 7177 : if (string == NULL || slength == NULL)
4296 tgl@sss.pgh.pa.us 56 :UBC 0 : return REG_INVARG;
4296 tgl@sss.pgh.pa.us 57 :CBC 7177 : *string = NULL; /* initialize for failure cases */
58 : 7177 : *slength = 0;
59 [ + - - + ]: 7177 : if (re == NULL || re->re_magic != REMAGIC)
4296 tgl@sss.pgh.pa.us 60 :UBC 0 : return REG_INVARG;
4296 tgl@sss.pgh.pa.us 61 [ - + ]:CBC 7177 : if (re->re_csize != sizeof(chr))
4296 tgl@sss.pgh.pa.us 62 :UBC 0 : return REG_MIXED;
63 : :
64 : : /* Initialize locale-dependent support */
4296 tgl@sss.pgh.pa.us 65 :CBC 7177 : pg_set_regex_collation(re->re_collation);
66 : :
67 : : /* setup */
68 : 7177 : g = (struct guts *) re->re_guts;
69 [ + + ]: 7177 : if (g->info & REG_UIMPOSSIBLE)
70 : 1 : return REG_NOMATCH;
71 : :
72 : : /*
73 : : * This implementation considers only the search NFA for the topmost regex
74 : : * tree node. Therefore, constraints such as backrefs are not fully
75 : : * applied, which is allowed per the function's API spec.
76 : : */
77 [ - + ]: 7176 : assert(g->tree != NULL);
78 : 7176 : cnfa = &g->tree->cnfa;
79 : :
80 : : /* matchall NFAs never have a fixed prefix */
1149 81 [ + + ]: 7176 : if (cnfa->flags & MATCHALL)
82 : 6 : return REG_NOMATCH;
83 : :
84 : : /*
85 : : * Since a correct NFA should never contain any exit-free loops, it should
86 : : * not be possible for our traversal to return to a previously visited NFA
87 : : * state. Hence we need at most nstates chrs in the output string.
88 : : */
4296 89 : 7170 : *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
90 [ - + ]: 7170 : if (*string == NULL)
4296 tgl@sss.pgh.pa.us 91 :UBC 0 : return REG_ESPACE;
92 : :
93 : : /* do it */
4296 tgl@sss.pgh.pa.us 94 :CBC 7170 : st = findprefix(cnfa, &g->cmap, *string, slength);
95 : :
96 [ - + ]: 7170 : assert(*slength <= cnfa->nstates);
97 : :
98 : : /* clean up */
99 [ + + + + ]: 7170 : if (st != REG_PREFIX && st != REG_EXACT)
100 : : {
101 : 352 : FREE(*string);
102 : 352 : *string = NULL;
103 : 352 : *slength = 0;
104 : : }
105 : :
106 : 7170 : return st;
107 : : }
108 : :
109 : : /*
110 : : * findprefix - extract common prefix from cNFA
111 : : *
112 : : * Results are returned into the preallocated chr array string[], with
113 : : * *slength (which must be preset to zero) incremented for each chr.
114 : : */
115 : : static int /* regprefix return code */
2489 116 : 7170 : findprefix(struct cnfa *cnfa,
117 : : struct colormap *cm,
118 : : chr *string,
119 : : size_t *slength)
120 : : {
121 : : int st;
122 : : int nextst;
123 : : color thiscolor;
124 : : chr c;
125 : : struct carc *ca;
126 : :
127 : : /*
128 : : * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
129 : : * anchored left. If we have both BOS and BOL, they must go to the same
130 : : * next state.
131 : : */
4296 132 : 7170 : st = cnfa->pre;
133 : 7170 : nextst = -1;
134 [ + + ]: 14081 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
135 : : {
136 [ + - + + ]: 7176 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
137 : : {
138 [ + + ]: 6917 : if (nextst == -1)
139 : 6911 : nextst = ca->to;
140 [ + - ]: 6 : else if (nextst != ca->to)
141 : 6 : return REG_NOMATCH;
142 : : }
143 : : else
144 : 259 : return REG_NOMATCH;
145 : : }
146 [ - + ]: 6905 : if (nextst == -1)
4296 tgl@sss.pgh.pa.us 147 :UBC 0 : return REG_NOMATCH;
148 : :
149 : : /*
150 : : * Scan through successive states, stopping as soon as we find one with
151 : : * more than one acceptable transition character (either multiple colors
152 : : * on out-arcs, or a color with more than one member chr).
153 : : *
154 : : * We could find a state with multiple out-arcs that are all labeled with
155 : : * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
156 : : * In that case we add the chr "c" to the output string but then exit the
157 : : * loop with nextst == -1. This leaves a little bit on the table: if the
158 : : * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
159 : : * to the prefix. But chasing multiple parallel state chains doesn't seem
160 : : * worth the trouble.
161 : : */
162 : : do
163 : : {
4296 tgl@sss.pgh.pa.us 164 :CBC 88879 : st = nextst;
165 : 88879 : nextst = -1;
166 : 88879 : thiscolor = COLORLESS;
167 [ + + ]: 170923 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
168 : : {
169 : : /* We can ignore BOS/BOL arcs */
170 [ + - - + ]: 88900 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
4296 tgl@sss.pgh.pa.us 171 :UBC 0 : continue;
172 : :
173 : : /*
174 : : * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
175 : : * and LACONs
176 : : */
3100 tgl@sss.pgh.pa.us 177 [ + - + + ]:CBC 88900 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
1149 178 [ + + + + ]: 82989 : ca->co == RAINBOW || ca->co >= cnfa->ncolors)
179 : : {
4296 180 : 6847 : thiscolor = COLORLESS;
181 : 6847 : break;
182 : : }
183 [ + + ]: 82053 : if (thiscolor == COLORLESS)
184 : : {
185 : : /* First plain outarc */
186 : 82038 : thiscolor = ca->co;
187 : 82038 : nextst = ca->to;
188 : : }
189 [ + + ]: 15 : else if (thiscolor == ca->co)
190 : : {
191 : : /* Another plain outarc for same color */
192 : 6 : nextst = -1;
193 : : }
194 : : else
195 : : {
196 : : /* More than one plain outarc color terminates the search */
197 : 9 : thiscolor = COLORLESS;
198 : 9 : break;
199 : : }
200 : : }
201 : : /* Done if we didn't find exactly one color on plain outarcs */
202 [ + + ]: 88879 : if (thiscolor == COLORLESS)
203 : 6856 : break;
204 : : /* The color must be a singleton */
2778 205 [ + + ]: 82023 : if (cm->cd[thiscolor].nschrs != 1)
206 : 43 : break;
207 : : /* Must not have any high-color-map entries */
208 [ - + ]: 81980 : if (cm->cd[thiscolor].nuchrs != 0)
4296 tgl@sss.pgh.pa.us 209 :UBC 0 : break;
210 : :
211 : : /*
212 : : * Identify the color's sole member chr and add it to the prefix
213 : : * string. In general the colormap data structure doesn't provide a
214 : : * way to find color member chrs, except by trying GETCOLOR() on each
215 : : * possible chr value, which won't do at all. However, for the cases
216 : : * we care about it should be sufficient to test the "firstchr" value,
217 : : * that is the first chr ever added to the color. There are cases
218 : : * where this might no longer be a member of the color (so we do need
219 : : * to test), but none of them are likely to arise for a character that
220 : : * is a member of a common prefix. If we do hit such a corner case,
221 : : * we just fall out without adding anything to the prefix string.
222 : : */
4296 tgl@sss.pgh.pa.us 223 :CBC 81980 : c = cm->cd[thiscolor].firstchr;
224 [ + - - + ]: 81980 : if (GETCOLOR(cm, c) != thiscolor)
4296 tgl@sss.pgh.pa.us 225 :UBC 0 : break;
226 : :
4296 tgl@sss.pgh.pa.us 227 :CBC 81980 : string[(*slength)++] = c;
228 : :
229 : : /* Advance to next state, but only if we have a unique next state */
230 [ + + ]: 81980 : } while (nextst != -1);
231 : :
232 : : /*
233 : : * If we ended at a state that only has EOS/EOL outarcs leading to the
234 : : * "post" state, then we have an exact-match string. Note this is true
235 : : * even if the string is of zero length.
236 : : */
237 : 6905 : nextst = -1;
238 [ + + ]: 12816 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
239 : : {
240 [ + - + + ]: 6905 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
241 : : {
242 [ + - ]: 5911 : if (nextst == -1)
243 : 5911 : nextst = ca->to;
4296 tgl@sss.pgh.pa.us 244 [ # # ]:UBC 0 : else if (nextst != ca->to)
245 : : {
246 : 0 : nextst = -1;
247 : 0 : break;
248 : : }
249 : : }
250 : : else
251 : : {
4296 tgl@sss.pgh.pa.us 252 :CBC 994 : nextst = -1;
253 : 994 : break;
254 : : }
255 : : }
256 [ + + ]: 6905 : if (nextst == cnfa->post)
257 : 5911 : return REG_EXACT;
258 : :
259 : : /*
260 : : * Otherwise, if we were unable to identify any prefix characters, say
261 : : * NOMATCH --- the pattern is anchored left, but doesn't specify any
262 : : * particular first character.
263 : : */
264 [ + + ]: 994 : if (*slength > 0)
265 : 907 : return REG_PREFIX;
266 : :
267 : 87 : return REG_NOMATCH;
268 : : }
|