Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * regexport.c
4 : : * Functions for exporting info about a regex's NFA
5 : : *
6 : : * In this implementation, the NFA defines a necessary but not sufficient
7 : : * condition for a string to match the regex: that is, there can be strings
8 : : * that match the NFA but don't match the full regex, but not vice versa.
9 : : * Thus, for example, it is okay for the functions below to treat lookaround
10 : : * constraints as no-ops, since they merely constrain the string some more.
11 : : *
12 : : * Notice that these functions return info into caller-provided arrays
13 : : * rather than doing their own malloc's. This simplifies the APIs by
14 : : * eliminating a class of error conditions, and in the case of colors
15 : : * allows the caller to decide how big is too big to bother with.
16 : : *
17 : : *
18 : : * Portions Copyright (c) 2013-2024, PostgreSQL Global Development Group
19 : : * Portions Copyright (c) 1998, 1999 Henry Spencer
20 : : *
21 : : * IDENTIFICATION
22 : : * src/backend/regex/regexport.c
23 : : *
24 : : *-------------------------------------------------------------------------
25 : : */
26 : :
27 : : #include "regex/regguts.h"
28 : :
29 : : #include "regex/regexport.h"
30 : :
31 : :
32 : : /*
33 : : * Get total number of NFA states.
34 : : */
35 : : int
4023 tgl@sss.pgh.pa.us 36 :UBC 0 : pg_reg_getnumstates(const regex_t *regex)
37 : : {
38 : : struct cnfa *cnfa;
39 : :
40 [ # # # # ]: 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
41 : 0 : cnfa = &((struct guts *) regex->re_guts)->search;
42 : :
43 : 0 : return cnfa->nstates;
44 : : }
45 : :
46 : : /*
47 : : * Get initial state of NFA.
48 : : */
49 : : int
4023 tgl@sss.pgh.pa.us 50 :CBC 65 : pg_reg_getinitialstate(const regex_t *regex)
51 : : {
52 : : struct cnfa *cnfa;
53 : :
54 [ + - - + ]: 65 : assert(regex != NULL && regex->re_magic == REMAGIC);
55 : 65 : cnfa = &((struct guts *) regex->re_guts)->search;
56 : :
57 : 65 : return cnfa->pre;
58 : : }
59 : :
60 : : /*
61 : : * Get final state of NFA.
62 : : */
63 : : int
64 : 963 : pg_reg_getfinalstate(const regex_t *regex)
65 : : {
66 : : struct cnfa *cnfa;
67 : :
68 [ + - - + ]: 963 : assert(regex != NULL && regex->re_magic == REMAGIC);
69 : 963 : cnfa = &((struct guts *) regex->re_guts)->search;
70 : :
71 : 963 : return cnfa->post;
72 : : }
73 : :
74 : : /*
75 : : * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON
76 : : * arcs from the caller, treating any LACON as being automatically satisfied.
77 : : * Since the output representation does not support arcs that consume no
78 : : * character when traversed, we have to recursively traverse LACON arcs here,
79 : : * and report whatever normal arcs are reachable by traversing LACON arcs.
80 : : * Note that this wouldn't work if it were possible to reach the final state
81 : : * via LACON traversal, but the regex library never builds NFAs that have
82 : : * LACON arcs leading directly to the final state. (This is because the
83 : : * regex executor is designed to consume one character beyond the nominal
84 : : * match end --- possibly an EOS indicator --- so there is always a set of
85 : : * ordinary arcs leading to the final state.)
86 : : *
87 : : * traverse_lacons is a recursive subroutine used by both exported functions
88 : : * to count and then emit the reachable regular arcs. *arcs_count is
89 : : * incremented by the number of reachable arcs, and as many as will fit in
90 : : * arcs_len (possibly 0) are emitted into arcs[].
91 : : */
92 : : static void
2489 93 : 3290 : traverse_lacons(struct cnfa *cnfa, int st,
94 : : int *arcs_count,
95 : : regex_arc_t *arcs, int arcs_len)
96 : : {
97 : : struct carc *ca;
98 : :
99 : : /*
100 : : * Since this function recurses, it could theoretically be driven to stack
101 : : * overflow. In practice, this is mostly useful to backstop against a
102 : : * failure of the regex compiler to remove a loop of LACON arcs.
103 : : */
2558 104 : 3290 : check_stack_depth();
105 : :
106 [ + + ]: 8668 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
107 : : {
108 [ + + ]: 5378 : if (ca->co < cnfa->ncolors)
109 : : {
110 : : /* Ordinary arc, so count and possibly emit it */
111 : 5372 : int ndx = (*arcs_count)++;
112 : :
113 [ + + ]: 5372 : if (ndx < arcs_len)
114 : : {
115 : 2686 : arcs[ndx].co = ca->co;
116 : 2686 : arcs[ndx].to = ca->to;
117 : : }
118 : : }
119 : : else
120 : : {
121 : : /* LACON arc --- assume it's satisfied and recurse... */
122 : : /* ... but first, assert it doesn't lead directly to post state */
123 [ - + ]: 6 : Assert(ca->to != cnfa->post);
124 : :
125 : 6 : traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len);
126 : : }
127 : : }
128 : 3290 : }
129 : :
130 : : /*
131 : : * Get number of outgoing NFA arcs of state number "st".
132 : : */
133 : : int
4023 134 : 1644 : pg_reg_getnumoutarcs(const regex_t *regex, int st)
135 : : {
136 : : struct cnfa *cnfa;
137 : : int arcs_count;
138 : :
139 [ + - - + ]: 1644 : assert(regex != NULL && regex->re_magic == REMAGIC);
140 : 1644 : cnfa = &((struct guts *) regex->re_guts)->search;
141 : :
142 [ + - - + ]: 1644 : if (st < 0 || st >= cnfa->nstates)
4023 tgl@sss.pgh.pa.us 143 :UBC 0 : return 0;
2558 tgl@sss.pgh.pa.us 144 :CBC 1644 : arcs_count = 0;
145 : 1644 : traverse_lacons(cnfa, st, &arcs_count, NULL, 0);
146 : 1644 : return arcs_count;
147 : : }
148 : :
149 : : /*
150 : : * Write array of outgoing NFA arcs of state number "st" into arcs[],
151 : : * whose length arcs_len must be at least as long as indicated by
152 : : * pg_reg_getnumoutarcs(), else not all arcs will be returned.
153 : : */
154 : : void
4023 155 : 1644 : pg_reg_getoutarcs(const regex_t *regex, int st,
156 : : regex_arc_t *arcs, int arcs_len)
157 : : {
158 : : struct cnfa *cnfa;
159 : : int arcs_count;
160 : :
161 [ + - - + ]: 1644 : assert(regex != NULL && regex->re_magic == REMAGIC);
162 : 1644 : cnfa = &((struct guts *) regex->re_guts)->search;
163 : :
164 [ + - + - : 1644 : if (st < 0 || st >= cnfa->nstates || arcs_len <= 0)
+ + ]
165 : 4 : return;
2558 166 : 1640 : arcs_count = 0;
167 : 1640 : traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len);
168 : : }
169 : :
170 : : /*
171 : : * Get total number of colors.
172 : : */
173 : : int
4023 174 : 65 : pg_reg_getnumcolors(const regex_t *regex)
175 : : {
176 : : struct colormap *cm;
177 : :
178 [ + - - + ]: 65 : assert(regex != NULL && regex->re_magic == REMAGIC);
179 : 65 : cm = &((struct guts *) regex->re_guts)->cmap;
180 : :
181 : 65 : return cm->max + 1;
182 : : }
183 : :
184 : : /*
185 : : * Check if color is beginning of line/string.
186 : : *
187 : : * (We might at some point need to offer more refined handling of pseudocolors,
188 : : * but this will do for now.)
189 : : */
190 : : int
191 : 1505 : pg_reg_colorisbegin(const regex_t *regex, int co)
192 : : {
193 : : struct cnfa *cnfa;
194 : :
195 [ + - - + ]: 1505 : assert(regex != NULL && regex->re_magic == REMAGIC);
196 : 1505 : cnfa = &((struct guts *) regex->re_guts)->search;
197 : :
198 [ + + + + ]: 1505 : if (co == cnfa->bos[0] || co == cnfa->bos[1])
199 : 246 : return true;
200 : : else
201 : 1259 : return false;
202 : : }
203 : :
204 : : /*
205 : : * Check if color is end of line/string.
206 : : */
207 : : int
208 : 1259 : pg_reg_colorisend(const regex_t *regex, int co)
209 : : {
210 : : struct cnfa *cnfa;
211 : :
212 [ + - - + ]: 1259 : assert(regex != NULL && regex->re_magic == REMAGIC);
213 : 1259 : cnfa = &((struct guts *) regex->re_guts)->search;
214 : :
215 [ + + + + ]: 1259 : if (co == cnfa->eos[0] || co == cnfa->eos[1])
216 : 162 : return true;
217 : : else
218 : 1097 : return false;
219 : : }
220 : :
221 : : /*
222 : : * Get number of member chrs of color number "co".
223 : : *
224 : : * Note: we return -1 if the color number is invalid, or if it is a special
225 : : * color (WHITE, RAINBOW, or a pseudocolor), or if the number of members is
226 : : * uncertain.
227 : : * Callers should not try to extract the members if -1 is returned.
228 : : */
229 : : int
230 : 531 : pg_reg_getnumcharacters(const regex_t *regex, int co)
231 : : {
232 : : struct colormap *cm;
233 : :
234 [ + - - + ]: 531 : assert(regex != NULL && regex->re_magic == REMAGIC);
235 : 531 : cm = &((struct guts *) regex->re_guts)->cmap;
236 : :
1149 237 [ + + - + ]: 531 : if (co <= 0 || co > cm->max) /* <= 0 rejects WHITE and RAINBOW */
4023 238 : 65 : return -1;
2489 239 [ + + ]: 466 : if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */
4023 240 : 260 : return -1;
241 : :
242 : : /*
243 : : * If the color appears anywhere in the high colormap, treat its number of
244 : : * members as uncertain. In principle we could determine all the specific
245 : : * chrs corresponding to each such entry, but it would be expensive
246 : : * (particularly if character class tests are required) and it doesn't
247 : : * seem worth it.
248 : : */
2778 249 [ - + ]: 206 : if (cm->cd[co].nuchrs != 0)
2778 tgl@sss.pgh.pa.us 250 :UBC 0 : return -1;
251 : :
252 : : /* OK, return the known number of member chrs */
2778 tgl@sss.pgh.pa.us 253 :CBC 206 : return cm->cd[co].nschrs;
254 : : }
255 : :
256 : : /*
257 : : * Write array of member chrs of color number "co" into chars[],
258 : : * whose length chars_len must be at least as long as indicated by
259 : : * pg_reg_getnumcharacters(), else not all chars will be returned.
260 : : *
261 : : * Fetching the members of WHITE, RAINBOW, or a pseudocolor is not supported.
262 : : *
263 : : * Caution: this is a relatively expensive operation.
264 : : */
265 : : void
4023 266 : 206 : pg_reg_getcharacters(const regex_t *regex, int co,
267 : : pg_wchar *chars, int chars_len)
268 : : {
269 : : struct colormap *cm;
270 : : chr c;
271 : :
272 [ + - - + ]: 206 : assert(regex != NULL && regex->re_magic == REMAGIC);
273 : 206 : cm = &((struct guts *) regex->re_guts)->cmap;
274 : :
275 [ + - + - : 206 : if (co <= 0 || co > cm->max || chars_len <= 0)
- + ]
4023 tgl@sss.pgh.pa.us 276 :UBC 0 : return;
4023 tgl@sss.pgh.pa.us 277 [ - + ]:CBC 206 : if (cm->cd[co].flags & PSEUDO)
4023 tgl@sss.pgh.pa.us 278 :UBC 0 : return;
279 : :
280 : : /*
281 : : * We need only examine the low character map; there should not be any
282 : : * matching entries in the high map.
283 : : */
2778 tgl@sss.pgh.pa.us 284 [ + - ]:CBC 20208 : for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++)
285 : : {
286 [ + + ]: 20208 : if (cm->locolormap[c - CHR_MIN] == co)
287 : : {
288 : 785 : *chars++ = c;
289 [ + + ]: 785 : if (--chars_len == 0)
290 : 206 : break;
291 : : }
292 : : }
293 : : }
|