TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsvector_parser.c
4 : * Parser for tsvector
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/utils/adt/tsvector_parser.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "tsearch/ts_locale.h"
18 : #include "tsearch/ts_utils.h"
19 :
20 :
21 : /*
22 : * Private state of tsvector parser. Note that tsquery also uses this code to
23 : * parse its input, hence the boolean flags. The oprisdelim and is_tsquery
24 : * flags are both true or both false in current usage, but we keep them
25 : * separate for clarity.
26 : *
27 : * If oprisdelim is set, the following characters are treated as delimiters
28 : * (in addition to whitespace): ! | & ( )
29 : *
30 : * is_tsquery affects *only* the content of error messages.
31 : *
32 : * is_web can be true to further modify tsquery parsing.
33 : *
34 : * If escontext is an ErrorSaveContext node, then soft errors can be
35 : * captured there rather than being thrown.
36 : */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character (database encoding) */
	bool		oprisdelim;		/* treat ! | & ( ) as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
	bool		is_web;			/* we're in websearch_to_tsquery() */
	Node	   *escontext;		/* for soft error reporting */
};
49 :
50 :
51 : /*
52 : * Initializes a parser state object for the given input string.
53 : * A bitmask of flags (see ts_utils.h) and an error context object
54 : * can be provided as well.
55 : */
56 : TSVectorParseState
57 GNC 3816 : init_tsvector_parser(char *input, int flags, Node *escontext)
58 : {
59 : TSVectorParseState state;
60 :
61 GIC 3816 : state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
62 3816 : state->prsbuf = input;
63 3816 : state->bufstart = input;
64 3816 : state->len = 32;
65 3816 : state->word = (char *) palloc(state->len);
66 3816 : state->eml = pg_database_encoding_max_length();
67 3816 : state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
68 CBC 3816 : state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
69 GIC 3816 : state->is_web = (flags & P_TSV_IS_WEB) != 0;
70 GNC 3816 : state->escontext = escontext;
71 :
72 GIC 3816 : return state;
73 ECB : }
74 :
75 : /*
76 : * Reinitializes parser to parse 'input', instead of previous input.
77 : *
78 : * Note that bufstart (the string reported in errors) is not changed.
79 : */
80 : void
81 CBC 4065 : reset_tsvector_parser(TSVectorParseState state, char *input)
82 ECB : {
83 CBC 4065 : state->prsbuf = input;
84 4065 : }
85 :
86 ECB : /*
87 : * Shuts down a tsvector parser.
88 : */
89 : void
90 GIC 3813 : close_tsvector_parser(TSVectorParseState state)
91 : {
92 3813 : pfree(state->word);
93 3813 : pfree(state);
94 3813 : }
95 ECB :
/*
 * Grow 'word' if it might not have room for one more character.
 *
 * Expects 'state' and 'curpos' (the write position inside state->word) to be
 * in scope.  When fewer than eml + 1 bytes remain, the buffer is doubled and
 * curpos is re-pointed at the same offset in the reallocated buffer.
 */
#define RESIZEPRSBUF \
do { \
	int		used = curpos - state->word; \
	if (state->len <= used + state->eml) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + used; \
	} \
} while (0)
107 :
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Expects the locals of gettoken_tsvector (state, curpos, pos, npos) and its
 * output parameters to be in scope.  Every output pointer is optional:
 *
 * - If pos_ptr is given, ownership of the position array passes to the
 *   caller, and *poslen is set too when poslen is given.  Otherwise any
 *   array that was collected is freed here.
 * - *strval points at state->word and *lenval is the number of bytes
 *   written to it.
 * - *endptr is the current scan position.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		if (poslen != NULL) \
			*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
127 :
128 :
/* State codes used in gettoken_tsvector */
enum
{
	WAITWORD = 1,				/* before a token: skipping whitespace */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* previous character was a backslash */
	WAITENDCMPLX = 4,			/* inside a single-quoted word */
	WAITPOSINFO = 5,			/* quoted word ended; ':' may follow */
	INPOSINFO = 6,				/* expecting a position number */
	WAITPOSDELIM = 7,			/* after a position number */
	WAITCHARCMPLX = 8			/* saw a quote inside a quoted word */
};
138 :
139 : #define PRSSYNTAXERROR return prssyntaxerror(state)
140 :
141 : static bool
142 GIC 9 : prssyntaxerror(TSVectorParseState state)
143 : {
144 GNC 9 : errsave(state->escontext,
145 : (errcode(ERRCODE_SYNTAX_ERROR),
146 : state->is_tsquery ?
147 : errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
148 : errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
149 : /* In soft error situation, return false as convenience for caller */
150 6 : return false;
151 : }
152 :
153 :
154 : /*
155 : * Get next token from string being parsed. Returns true if successful,
156 : * false if end of input string is reached or soft error.
157 : *
158 : * On success, these output parameters are filled in:
159 ECB : *
160 : * *strval pointer to token
161 : * *lenval length of *strval
162 : * *pos_ptr pointer to a palloc'd array of positions and weights
163 : * associated with the token. If the caller is not interested
164 : * in the information, NULL can be supplied. Otherwise
165 : * the caller is responsible for pfreeing the array.
166 : * *poslen number of elements in *pos_ptr
167 : * *endptr scan resumption point
168 : *
169 : * Pass NULL for any unwanted output parameters.
170 : *
171 : * If state->escontext is an ErrorSaveContext, then caller must check
172 : * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
173 : * error or normal end-of-string.
174 : */
175 : bool
176 GIC 96365 : gettoken_tsvector(TSVectorParseState state,
177 : char **strval, int *lenval,
178 : WordEntryPos **pos_ptr, int *poslen,
179 : char **endptr)
180 : {
181 96365 : int oldstate = 0;
182 96365 : char *curpos = state->word;
183 96365 : int statecode = WAITWORD;
184 :
185 : /*
186 : * pos is for collecting the comma delimited list of positions followed by
187 : * the actual token.
188 : */
189 96365 : WordEntryPos *pos = NULL;
190 96365 : int npos = 0; /* elements of pos used */
191 96365 : int posalen = 0; /* allocated size of pos */
192 :
193 : while (1)
194 : {
195 392574 : if (statecode == WAITWORD)
196 : {
197 CBC 185055 : if (*(state->prsbuf) == '\0')
198 GIC 1898 : return false;
199 183157 : else if (!state->is_web && t_iseq(state->prsbuf, '\''))
200 81 : statecode = WAITENDCMPLX;
201 183076 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
202 ECB : {
203 CBC 3 : statecode = WAITNEXTCHAR;
204 3 : oldstate = WAITENDWORD;
205 : }
206 GIC 183073 : else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
207 183073 : (state->is_web && t_iseq(state->prsbuf, '"')))
208 UIC 0 : PRSSYNTAXERROR;
209 GIC 183073 : else if (!t_isspace(state->prsbuf))
210 ECB : {
211 CBC 94383 : COPYCHAR(curpos, state->prsbuf);
212 94383 : curpos += pg_mblen(state->prsbuf);
213 GIC 94383 : statecode = WAITENDWORD;
214 : }
215 : }
216 CBC 207519 : else if (statecode == WAITNEXTCHAR)
217 : {
218 81 : if (*(state->prsbuf) == '\0')
219 UNC 0 : ereturn(state->escontext, false,
220 ECB : (errcode(ERRCODE_SYNTAX_ERROR),
221 : errmsg("there is no escaped character: \"%s\"",
222 : state->bufstart)));
223 : else
224 : {
225 CBC 81 : RESIZEPRSBUF;
226 GIC 81 : COPYCHAR(curpos, state->prsbuf);
227 CBC 81 : curpos += pg_mblen(state->prsbuf);
228 81 : Assert(oldstate != 0);
229 GBC 81 : statecode = oldstate;
230 ECB : }
231 : }
232 CBC 207438 : else if (statecode == WAITENDWORD)
233 ECB : {
234 CBC 191625 : if (!state->is_web && t_iseq(state->prsbuf, '\\'))
235 : {
236 GIC 36 : statecode = WAITNEXTCHAR;
237 CBC 36 : oldstate = WAITENDWORD;
238 : }
239 191589 : else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
240 GBC 103404 : (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
241 GIC 102516 : (state->is_web && t_iseq(state->prsbuf, '"')))
242 : {
243 89079 : RESIZEPRSBUF;
244 89079 : if (curpos == state->word)
245 UIC 0 : PRSSYNTAXERROR;
246 CBC 89079 : *(curpos) = '\0';
247 89079 : RETURN_TOKEN;
248 ECB : }
249 CBC 102510 : else if (t_iseq(state->prsbuf, ':'))
250 ECB : {
251 GIC 5307 : if (curpos == state->word)
252 UIC 0 : PRSSYNTAXERROR;
253 CBC 5307 : *(curpos) = '\0';
254 GIC 5307 : if (state->oprisdelim)
255 CBC 348 : RETURN_TOKEN;
256 : else
257 4959 : statecode = INPOSINFO;
258 ECB : }
259 : else
260 : {
261 CBC 97203 : RESIZEPRSBUF;
262 97203 : COPYCHAR(curpos, state->prsbuf);
263 GIC 97203 : curpos += pg_mblen(state->prsbuf);
264 ECB : }
265 : }
266 GBC 15813 : else if (statecode == WAITENDCMPLX)
267 ECB : {
268 CBC 462 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
269 : {
270 81 : statecode = WAITCHARCMPLX;
271 : }
272 381 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
273 EUB : {
274 CBC 42 : statecode = WAITNEXTCHAR;
275 42 : oldstate = WAITENDCMPLX;
276 ECB : }
277 GIC 339 : else if (*(state->prsbuf) == '\0')
278 LBC 0 : PRSSYNTAXERROR;
279 : else
280 : {
281 GIC 339 : RESIZEPRSBUF;
282 CBC 339 : COPYCHAR(curpos, state->prsbuf);
283 339 : curpos += pg_mblen(state->prsbuf);
284 ECB : }
285 : }
286 GIC 15351 : else if (statecode == WAITCHARCMPLX)
287 ECB : {
288 GIC 81 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
289 ECB : {
290 UIC 0 : RESIZEPRSBUF;
291 LBC 0 : COPYCHAR(curpos, state->prsbuf);
292 UIC 0 : curpos += pg_mblen(state->prsbuf);
293 LBC 0 : statecode = WAITENDCMPLX;
294 : }
295 ECB : else
296 : {
297 GIC 81 : RESIZEPRSBUF;
298 CBC 81 : *(curpos) = '\0';
299 GBC 81 : if (curpos == state->word)
300 GIC 9 : PRSSYNTAXERROR;
301 72 : if (state->oprisdelim)
302 ECB : {
303 : /* state->prsbuf+=pg_mblen(state->prsbuf); */
304 CBC 33 : RETURN_TOKEN;
305 : }
306 : else
307 39 : statecode = WAITPOSINFO;
308 GIC 39 : continue; /* recheck current character */
309 ECB : }
310 : }
311 GBC 15270 : else if (statecode == WAITPOSINFO)
312 EUB : {
313 GBC 39 : if (t_iseq(state->prsbuf, ':'))
314 UBC 0 : statecode = INPOSINFO;
315 : else
316 GIC 39 : RETURN_TOKEN;
317 : }
318 CBC 15231 : else if (statecode == INPOSINFO)
319 ECB : {
320 CBC 5262 : if (t_isdigit(state->prsbuf))
321 ECB : {
322 CBC 5262 : if (posalen == 0)
323 : {
324 GIC 4959 : posalen = 4;
325 CBC 4959 : pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
326 GIC 4959 : npos = 0;
327 : }
328 CBC 303 : else if (npos + 1 >= posalen)
329 ECB : {
330 GIC 57 : posalen *= 2;
331 57 : pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
332 ECB : }
333 GIC 5262 : npos++;
334 CBC 5262 : WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
335 EUB : /* we cannot get here in tsquery, so no need for 2 errmsgs */
336 GIC 5262 : if (WEP_GETPOS(pos[npos - 1]) == 0)
337 UNC 0 : ereturn(state->escontext, false,
338 : (errcode(ERRCODE_SYNTAX_ERROR),
339 ECB : errmsg("wrong position info in tsvector: \"%s\"",
340 : state->bufstart)));
341 CBC 5262 : WEP_SETWEIGHT(pos[npos - 1], 0);
342 GIC 5262 : statecode = WAITPOSDELIM;
343 ECB : }
344 : else
345 LBC 0 : PRSSYNTAXERROR;
346 ECB : }
347 CBC 9969 : else if (statecode == WAITPOSDELIM)
348 : {
349 9969 : if (t_iseq(state->prsbuf, ','))
350 GIC 303 : statecode = INPOSINFO;
351 CBC 9666 : else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
352 ECB : {
353 GIC 210 : if (WEP_GETWEIGHT(pos[npos - 1]))
354 LBC 0 : PRSSYNTAXERROR;
355 CBC 210 : WEP_SETWEIGHT(pos[npos - 1], 3);
356 : }
357 9456 : else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
358 EUB : {
359 GIC 108 : if (WEP_GETWEIGHT(pos[npos - 1]))
360 UIC 0 : PRSSYNTAXERROR;
361 GIC 108 : WEP_SETWEIGHT(pos[npos - 1], 2);
362 ECB : }
363 CBC 9348 : else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
364 : {
365 GIC 138 : if (WEP_GETWEIGHT(pos[npos - 1]))
366 UBC 0 : PRSSYNTAXERROR;
367 GIC 138 : WEP_SETWEIGHT(pos[npos - 1], 1);
368 ECB : }
369 GIC 9210 : else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
370 ECB : {
371 CBC 66 : if (WEP_GETWEIGHT(pos[npos - 1]))
372 LBC 0 : PRSSYNTAXERROR;
373 GIC 66 : WEP_SETWEIGHT(pos[npos - 1], 0);
374 ECB : }
375 GBC 9144 : else if (t_isspace(state->prsbuf) ||
376 CBC 4401 : *(state->prsbuf) == '\0')
377 GIC 4959 : RETURN_TOKEN;
378 CBC 4185 : else if (!t_isdigit(state->prsbuf))
379 UIC 0 : PRSSYNTAXERROR;
380 ECB : }
381 EUB : else /* internal error */
382 LBC 0 : elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
383 : statecode);
384 ECB :
385 : /* get next char */
386 CBC 296170 : state->prsbuf += pg_mblen(state->prsbuf);
387 EUB : }
388 ECB : }
|