Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regprefix.c
4 : * Extract a common prefix, if any, from a compiled regex.
5 : *
6 : *
7 : * Portions Copyright (c) 2012-2023, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1998, 1999 Henry Spencer
9 : *
10 : * IDENTIFICATION
11 : * src/backend/regex/regprefix.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "regex/regguts.h"
17 :
18 :
19 : /*
20 : * forward declarations
21 : */
22 : static int findprefix(struct cnfa *cnfa, struct colormap *cm,
23 : chr *string, size_t *slength);
24 :
25 :
26 : /*
27 : * pg_regprefix - get common prefix for regular expression
28 : *
29 : * Returns one of:
30 : * REG_NOMATCH: there is no common prefix of strings matching the regex
31 : * REG_PREFIX: there is a common prefix of strings matching the regex
32 : * REG_EXACT: all strings satisfying the regex must match the same string
33 : * or a REG_XXX error code
34 : *
35 : * In the non-failure cases, *string is set to a palloc'd string containing
36 : * the common prefix or exact value, of length *slength (measured in chrs
37 : * not bytes!).
38 : *
39 : * This function does not analyze all complex cases (such as lookaround
40 : * constraints) exactly. Therefore it is possible that some strings matching
41 : * the reported prefix or exact-match string do not satisfy the regex. But
42 : * it should never be the case that a string satisfying the regex does not
43 : * match the reported prefix or exact-match string.
44 : */
45 : int
3925 tgl 46 CBC 6449 : pg_regprefix(regex_t *re,
47 : chr **string,
48 : size_t *slength)
49 : {
50 : struct guts *g;
51 : struct cnfa *cnfa;
52 : int st;
53 :
54 : /* sanity checks */
55 6449 : if (string == NULL || slength == NULL)
3925 tgl 56 UBC 0 : return REG_INVARG;
3925 tgl 57 CBC 6449 : *string = NULL; /* initialize for failure cases */
58 6449 : *slength = 0;
59 6449 : if (re == NULL || re->re_magic != REMAGIC)
3925 tgl 60 UBC 0 : return REG_INVARG;
3925 tgl 61 CBC 6449 : if (re->re_csize != sizeof(chr))
3925 tgl 62 UBC 0 : return REG_MIXED;
63 :
64 : /* Initialize locale-dependent support */
3925 tgl 65 CBC 6449 : pg_set_regex_collation(re->re_collation);
66 :
67 : /* setup */
68 6449 : g = (struct guts *) re->re_guts;
69 6449 : if (g->info & REG_UIMPOSSIBLE)
70 1 : return REG_NOMATCH;
71 :
72 : /*
73 : * This implementation considers only the search NFA for the topmost regex
74 : * tree node. Therefore, constraints such as backrefs are not fully
75 : * applied, which is allowed per the function's API spec.
76 : */
77 6448 : assert(g->tree != NULL);
78 6448 : cnfa = &g->tree->cnfa;
79 :
80 : /* matchall NFAs never have a fixed prefix */
778 81 6448 : if (cnfa->flags & MATCHALL)
82 6 : return REG_NOMATCH;
83 :
84 : /*
85 : * Since a correct NFA should never contain any exit-free loops, it should
86 : * not be possible for our traversal to return to a previously visited NFA
87 : * state. Hence we need at most nstates chrs in the output string.
88 : */
3925 89 6442 : *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
90 6442 : if (*string == NULL)
3925 tgl 91 UBC 0 : return REG_ESPACE;
92 :
93 : /* do it */
3925 tgl 94 CBC 6442 : st = findprefix(cnfa, &g->cmap, *string, slength);
95 :
96 6442 : assert(*slength <= cnfa->nstates);
97 :
98 : /* clean up */
99 6442 : if (st != REG_PREFIX && st != REG_EXACT)
100 : {
101 357 : FREE(*string);
102 357 : *string = NULL;
103 357 : *slength = 0;
104 : }
105 :
106 6442 : return st;
107 : }
108 :
109 : /*
110 : * findprefix - extract common prefix from cNFA
111 : *
112 : * Results are returned into the preallocated chr array string[], with
113 : * *slength (which must be preset to zero) incremented for each chr.
114 : */
115 : static int /* regprefix return code */
2118 116 6442 : findprefix(struct cnfa *cnfa,
117 : struct colormap *cm,
118 : chr *string,
119 : size_t *slength)
120 : {
121 : int st;
122 : int nextst;
123 : color thiscolor;
124 : chr c;
125 : struct carc *ca;
126 :
127 : /*
128 : * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
129 : * anchored left. If we have both BOS and BOL, they must go to the same
130 : * next state.
131 : */
3925 132 6442 : st = cnfa->pre;
133 6442 : nextst = -1;
134 12621 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
135 : {
136 6448 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
137 : {
138 6185 : if (nextst == -1)
139 6179 : nextst = ca->to;
140 6 : else if (nextst != ca->to)
141 6 : return REG_NOMATCH;
142 : }
143 : else
144 263 : return REG_NOMATCH;
145 : }
146 6173 : if (nextst == -1)
3925 tgl 147 UBC 0 : return REG_NOMATCH;
148 :
149 : /*
150 : * Scan through successive states, stopping as soon as we find one with
151 : * more than one acceptable transition character (either multiple colors
152 : * on out-arcs, or a color with more than one member chr).
153 : *
154 : * We could find a state with multiple out-arcs that are all labeled with
155 : * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
156 : * In that case we add the chr "c" to the output string but then exit the
157 : * loop with nextst == -1. This leaves a little bit on the table: if the
158 : * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
159 : * to the prefix. But chasing multiple parallel state chains doesn't seem
160 : * worth the trouble.
161 : */
162 : do
163 : {
3925 tgl 164 CBC 79814 : st = nextst;
165 79814 : nextst = -1;
166 79814 : thiscolor = COLORLESS;
167 153535 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
168 : {
169 : /* We can ignore BOS/BOL arcs */
170 79835 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
3925 tgl 171 UBC 0 : continue;
172 :
173 : /*
174 : * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
175 : * and LACONs
176 : */
2729 tgl 177 CBC 79835 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
778 178 74376 : ca->co == RAINBOW || ca->co >= cnfa->ncolors)
179 : {
3925 180 6105 : thiscolor = COLORLESS;
181 6105 : break;
182 : }
183 73730 : if (thiscolor == COLORLESS)
184 : {
185 : /* First plain outarc */
186 73715 : thiscolor = ca->co;
187 73715 : nextst = ca->to;
188 : }
189 15 : else if (thiscolor == ca->co)
190 : {
191 : /* Another plain outarc for same color */
192 6 : nextst = -1;
193 : }
194 : else
195 : {
196 : /* More than one plain outarc color terminates the search */
197 9 : thiscolor = COLORLESS;
198 9 : break;
199 : }
200 : }
201 : /* Done if we didn't find exactly one color on plain outarcs */
202 79814 : if (thiscolor == COLORLESS)
203 6114 : break;
204 : /* The color must be a singleton */
2407 205 73700 : if (cm->cd[thiscolor].nschrs != 1)
206 53 : break;
207 : /* Must not have any high-color-map entries */
208 73647 : if (cm->cd[thiscolor].nuchrs != 0)
3925 tgl 209 UBC 0 : break;
210 :
211 : /*
212 : * Identify the color's sole member chr and add it to the prefix
213 : * string. In general the colormap data structure doesn't provide a
214 : * way to find color member chrs, except by trying GETCOLOR() on each
215 : * possible chr value, which won't do at all. However, for the cases
216 : * we care about it should be sufficient to test the "firstchr" value,
217 : * that is the first chr ever added to the color. There are cases
218 : * where this might no longer be a member of the color (so we do need
219 : * to test), but none of them are likely to arise for a character that
220 : * is a member of a common prefix. If we do hit such a corner case,
221 : * we just fall out without adding anything to the prefix string.
222 : */
3925 tgl 223 CBC 73647 : c = cm->cd[thiscolor].firstchr;
224 73647 : if (GETCOLOR(cm, c) != thiscolor)
3925 tgl 225 UBC 0 : break;
226 :
3925 tgl 227 CBC 73647 : string[(*slength)++] = c;
228 :
229 : /* Advance to next state, but only if we have a unique next state */
230 73647 : } while (nextst != -1);
231 :
232 : /*
233 : * If we ended at a state that only has EOS/EOL outarcs leading to the
234 : * "post" state, then we have an exact-match string. Note this is true
235 : * even if the string is of zero length.
236 : */
237 6173 : nextst = -1;
238 11632 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
239 : {
240 6173 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
241 : {
242 5459 : if (nextst == -1)
243 5459 : nextst = ca->to;
3925 tgl 244 UBC 0 : else if (nextst != ca->to)
245 : {
246 0 : nextst = -1;
247 0 : break;
248 : }
249 : }
250 : else
251 : {
3925 tgl 252 CBC 714 : nextst = -1;
253 714 : break;
254 : }
255 : }
256 6173 : if (nextst == cnfa->post)
257 5459 : return REG_EXACT;
258 :
259 : /*
260 : * Otherwise, if we were unable to identify any prefix characters, say
261 : * NOMATCH --- the pattern is anchored left, but doesn't specify any
262 : * particular first character.
263 : */
264 714 : if (*slength > 0)
265 626 : return REG_PREFIX;
266 :
267 88 : return REG_NOMATCH;
268 : }
|