Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsvector_parser.c
4 : * Parser for tsvector
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/utils/adt/tsvector_parser.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "tsearch/ts_locale.h"
18 : #include "tsearch/ts_utils.h"
19 :
20 :
21 : /*
22 : * Private state of tsvector parser. Note that tsquery also uses this code to
23 : * parse its input, hence the boolean flags. The oprisdelim and is_tsquery
24 : * flags are both true or both false in current usage, but we keep them
25 : * separate for clarity.
26 : *
27 : * If oprisdelim is set, the following characters are treated as delimiters
28 : * (in addition to whitespace): ! | & ( )
29 : *
30 : * is_tsquery affects *only* the content of error messages.
31 : *
32 : * is_web can be true to further modify tsquery parsing.
33 : *
34 : * If escontext is an ErrorSaveContext node, then soft errors can be
35 : * captured there rather than being thrown.
36 : */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ! | & ( ) as delimiters? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
	bool		is_web;			/* we're in websearch_to_tsquery() */
	Node	   *escontext;		/* for soft error reporting */
};
49 :
50 :
51 : /*
52 : * Initializes a parser state object for the given input string.
53 : * A bitmask of flags (see ts_utils.h) and an error context object
54 : * can be provided as well.
55 : */
56 : TSVectorParseState
103 tgl 57 GNC 3816 : init_tsvector_parser(char *input, int flags, Node *escontext)
58 : {
59 : TSVectorParseState state;
60 :
5693 teodor 61 GIC 3816 : state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
62 3816 : state->prsbuf = input;
5649 tgl 63 3816 : state->bufstart = input;
5693 teodor 64 3816 : state->len = 32;
65 3816 : state->word = (char *) palloc(state->len);
5649 tgl 66 3816 : state->eml = pg_database_encoding_max_length();
1830 teodor 67 3816 : state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
1830 teodor 68 CBC 3816 : state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
1830 teodor 69 GIC 3816 : state->is_web = (flags & P_TSV_IS_WEB) != 0;
103 tgl 70 GNC 3816 : state->escontext = escontext;
71 :
5693 teodor 72 GIC 3816 : return state;
5693 teodor 73 ECB : }
74 :
75 : /*
5649 tgl 76 : * Reinitializes parser to parse 'input', instead of previous input.
77 : *
78 : * Note that bufstart (the string reported in errors) is not changed.
5693 teodor 79 : */
80 : void
5693 teodor 81 CBC 4065 : reset_tsvector_parser(TSVectorParseState state, char *input)
5693 teodor 82 ECB : {
5624 bruce 83 CBC 4065 : state->prsbuf = input;
5693 teodor 84 4065 : }
85 :
5693 teodor 86 ECB : /*
87 : * Shuts down a tsvector parser.
88 : */
89 : void
5693 teodor 90 GIC 3813 : close_tsvector_parser(TSVectorParseState state)
91 : {
92 3813 : pfree(state->word);
93 3813 : pfree(state);
94 3813 : }
5693 teodor 95 ECB :
/*
 * Make sure 'word' has room for one more character (up to 'eml' bytes):
 * if not, double the buffer and re-point curpos into the new allocation.
 * Expects 'state' and 'curpos' to be in scope at the point of use.
 */
#define RESIZEPRSBUF \
do { \
	int			nbytes_used = curpos - state->word; \
	if (state->len <= nbytes_used + state->eml) \
	{ \
		state->len *= 2; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + nbytes_used; \
	} \
} while (0)
107 :
/*
 * Hands the finished token back to gettoken_tsvector's caller and returns
 * true.  Each output parameter is written only if the caller supplied a
 * non-NULL pointer for it.  If the caller did not ask for the position
 * array, any array that was collected is freed here instead.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr) \
	{ \
		*pos_ptr = pos; \
		*poslen = npos; \
	} \
	else if (pos) \
		pfree(pos); \
	\
	if (strval) \
		*strval = state->word; \
	if (lenval) \
		*lenval = curpos - state->word; \
	if (endptr) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
127 :
128 :
/* State codes used in gettoken_tsvector */
enum
{
	WAITWORD = 1,				/* skipping whitespace before a word */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* after a backslash: copy next char as-is */
	WAITENDCMPLX = 4,			/* inside a single-quoted word */
	WAITPOSINFO = 5,			/* after a quoted word: is there a ':'? */
	INPOSINFO = 6,				/* expecting a position number */
	WAITPOSDELIM = 7,			/* after a number: ',', weight, or end */
	WAITCHARCMPLX = 8			/* saw a quote inside a quoted word */
};
138 :
139 : #define PRSSYNTAXERROR return prssyntaxerror(state)
140 :
141 : static bool
5649 tgl 142 GIC 9 : prssyntaxerror(TSVectorParseState state)
143 : {
103 tgl 144 GNC 9 : errsave(state->escontext,
145 : (errcode(ERRCODE_SYNTAX_ERROR),
146 : state->is_tsquery ?
147 : errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
148 : errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
149 : /* In soft error situation, return false as convenience for caller */
150 6 : return false;
151 : }
152 :
153 :
154 : /*
155 : * Get next token from string being parsed. Returns true if successful,
156 : * false if end of input string is reached or soft error.
157 : *
158 : * On success, these output parameters are filled in:
5624 bruce 159 ECB : *
160 : * *strval pointer to token
161 : * *lenval length of *strval
162 : * *pos_ptr pointer to a palloc'd array of positions and weights
163 : * associated with the token. If the caller is not interested
164 : * in the information, NULL can be supplied. Otherwise
165 : * the caller is responsible for pfreeing the array.
166 : * *poslen number of elements in *pos_ptr
5649 tgl 167 : * *endptr scan resumption point
168 : *
169 : * Pass NULL for any unwanted output parameters.
170 : *
171 : * If state->escontext is an ErrorSaveContext, then caller must check
172 : * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
173 : * error or normal end-of-string.
174 : */
175 : bool
5624 bruce 176 GIC 96365 : gettoken_tsvector(TSVectorParseState state,
177 : char **strval, int *lenval,
178 : WordEntryPos **pos_ptr, int *poslen,
179 : char **endptr)
180 : {
181 96365 : int oldstate = 0;
182 96365 : char *curpos = state->word;
183 96365 : int statecode = WAITWORD;
184 :
185 : /*
186 : * pos is for collecting the comma delimited list of positions followed by
187 : * the actual token.
188 : */
5693 teodor 189 96365 : WordEntryPos *pos = NULL;
5624 bruce 190 96365 : int npos = 0; /* elements of pos used */
191 96365 : int posalen = 0; /* allocated size of pos */
192 :
193 : while (1)
194 : {
5693 teodor 195 392574 : if (statecode == WAITWORD)
196 : {
5693 teodor 197 CBC 185055 : if (*(state->prsbuf) == '\0')
5693 teodor 198 GIC 1898 : return false;
1830 199 183157 : else if (!state->is_web && t_iseq(state->prsbuf, '\''))
5693 200 81 : statecode = WAITENDCMPLX;
1830 201 183076 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
5693 teodor 202 ECB : {
5693 teodor 203 CBC 3 : statecode = WAITNEXTCHAR;
204 3 : oldstate = WAITENDWORD;
205 : }
1830 teodor 206 GIC 183073 : else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
207 183073 : (state->is_web && t_iseq(state->prsbuf, '"')))
5649 tgl 208 UIC 0 : PRSSYNTAXERROR;
5693 teodor 209 GIC 183073 : else if (!t_isspace(state->prsbuf))
5693 teodor 210 ECB : {
5693 teodor 211 CBC 94383 : COPYCHAR(curpos, state->prsbuf);
212 94383 : curpos += pg_mblen(state->prsbuf);
5693 teodor 213 GIC 94383 : statecode = WAITENDWORD;
214 : }
215 : }
5693 teodor 216 CBC 207519 : else if (statecode == WAITNEXTCHAR)
217 : {
218 81 : if (*(state->prsbuf) == '\0')
103 tgl 219 UNC 0 : ereturn(state->escontext, false,
5693 teodor 220 ECB : (errcode(ERRCODE_SYNTAX_ERROR),
5649 tgl 221 : errmsg("there is no escaped character: \"%s\"",
222 : state->bufstart)));
223 : else
5693 teodor 224 : {
5693 teodor 225 CBC 81 : RESIZEPRSBUF;
5693 teodor 226 GIC 81 : COPYCHAR(curpos, state->prsbuf);
5693 teodor 227 CBC 81 : curpos += pg_mblen(state->prsbuf);
228 81 : Assert(oldstate != 0);
5693 teodor 229 GBC 81 : statecode = oldstate;
5693 teodor 230 ECB : }
231 : }
5693 teodor 232 CBC 207438 : else if (statecode == WAITENDWORD)
5693 teodor 233 ECB : {
1830 teodor 234 CBC 191625 : if (!state->is_web && t_iseq(state->prsbuf, '\\'))
235 : {
5693 teodor 236 GIC 36 : statecode = WAITNEXTCHAR;
5693 teodor 237 CBC 36 : oldstate = WAITENDWORD;
238 : }
239 191589 : else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
1830 teodor 240 GBC 103404 : (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
1830 teodor 241 GIC 102516 : (state->is_web && t_iseq(state->prsbuf, '"')))
242 : {
5693 243 89079 : RESIZEPRSBUF;
244 89079 : if (curpos == state->word)
5649 tgl 245 UIC 0 : PRSSYNTAXERROR;
5693 teodor 246 CBC 89079 : *(curpos) = '\0';
247 89079 : RETURN_TOKEN;
5693 teodor 248 ECB : }
5693 teodor 249 CBC 102510 : else if (t_iseq(state->prsbuf, ':'))
5693 teodor 250 ECB : {
5693 teodor 251 GIC 5307 : if (curpos == state->word)
5649 tgl 252 UIC 0 : PRSSYNTAXERROR;
5693 teodor 253 CBC 5307 : *(curpos) = '\0';
5693 teodor 254 GIC 5307 : if (state->oprisdelim)
5693 teodor 255 CBC 348 : RETURN_TOKEN;
256 : else
257 4959 : statecode = INPOSINFO;
5693 teodor 258 ECB : }
259 : else
260 : {
5693 teodor 261 CBC 97203 : RESIZEPRSBUF;
262 97203 : COPYCHAR(curpos, state->prsbuf);
5693 teodor 263 GIC 97203 : curpos += pg_mblen(state->prsbuf);
5693 teodor 264 ECB : }
265 : }
5693 teodor 266 GBC 15813 : else if (statecode == WAITENDCMPLX)
5693 teodor 267 ECB : {
1830 teodor 268 CBC 462 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
269 : {
5693 270 81 : statecode = WAITCHARCMPLX;
271 : }
1830 272 381 : else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
5693 teodor 273 EUB : {
5693 teodor 274 CBC 42 : statecode = WAITNEXTCHAR;
275 42 : oldstate = WAITENDCMPLX;
5693 teodor 276 ECB : }
5693 teodor 277 GIC 339 : else if (*(state->prsbuf) == '\0')
5649 tgl 278 LBC 0 : PRSSYNTAXERROR;
279 : else
280 : {
5693 teodor 281 GIC 339 : RESIZEPRSBUF;
5693 teodor 282 CBC 339 : COPYCHAR(curpos, state->prsbuf);
283 339 : curpos += pg_mblen(state->prsbuf);
5693 teodor 284 ECB : }
285 : }
5693 teodor 286 GIC 15351 : else if (statecode == WAITCHARCMPLX)
5693 teodor 287 ECB : {
1830 teodor 288 GIC 81 : if (!state->is_web && t_iseq(state->prsbuf, '\''))
5693 teodor 289 ECB : {
5693 teodor 290 UIC 0 : RESIZEPRSBUF;
5693 teodor 291 LBC 0 : COPYCHAR(curpos, state->prsbuf);
5693 teodor 292 UIC 0 : curpos += pg_mblen(state->prsbuf);
5693 teodor 293 LBC 0 : statecode = WAITENDCMPLX;
294 : }
5693 teodor 295 ECB : else
296 : {
5693 teodor 297 GIC 81 : RESIZEPRSBUF;
5693 teodor 298 CBC 81 : *(curpos) = '\0';
5693 teodor 299 GBC 81 : if (curpos == state->word)
5649 tgl 300 GIC 9 : PRSSYNTAXERROR;
5693 teodor 301 72 : if (state->oprisdelim)
5693 teodor 302 ECB : {
303 : /* state->prsbuf+=pg_mblen(state->prsbuf); */
5693 teodor 304 CBC 33 : RETURN_TOKEN;
305 : }
306 : else
307 39 : statecode = WAITPOSINFO;
5693 teodor 308 GIC 39 : continue; /* recheck current character */
5693 teodor 309 ECB : }
310 : }
5693 teodor 311 GBC 15270 : else if (statecode == WAITPOSINFO)
5693 teodor 312 EUB : {
5693 teodor 313 GBC 39 : if (t_iseq(state->prsbuf, ':'))
5693 teodor 314 UBC 0 : statecode = INPOSINFO;
315 : else
5693 teodor 316 GIC 39 : RETURN_TOKEN;
317 : }
5693 teodor 318 CBC 15231 : else if (statecode == INPOSINFO)
5693 teodor 319 ECB : {
5693 teodor 320 CBC 5262 : if (t_isdigit(state->prsbuf))
5693 teodor 321 ECB : {
5693 teodor 322 CBC 5262 : if (posalen == 0)
323 : {
5693 teodor 324 GIC 4959 : posalen = 4;
5693 teodor 325 CBC 4959 : pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
5693 teodor 326 GIC 4959 : npos = 0;
327 : }
5693 teodor 328 CBC 303 : else if (npos + 1 >= posalen)
5693 teodor 329 ECB : {
5693 teodor 330 GIC 57 : posalen *= 2;
331 57 : pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
5693 teodor 332 ECB : }
5693 teodor 333 GIC 5262 : npos++;
5693 teodor 334 CBC 5262 : WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
5649 tgl 335 EUB : /* we cannot get here in tsquery, so no need for 2 errmsgs */
5693 teodor 336 GIC 5262 : if (WEP_GETPOS(pos[npos - 1]) == 0)
103 tgl 337 UNC 0 : ereturn(state->escontext, false,
338 : (errcode(ERRCODE_SYNTAX_ERROR),
5649 tgl 339 ECB : errmsg("wrong position info in tsvector: \"%s\"",
340 : state->bufstart)));
5693 teodor 341 CBC 5262 : WEP_SETWEIGHT(pos[npos - 1], 0);
5693 teodor 342 GIC 5262 : statecode = WAITPOSDELIM;
5693 teodor 343 ECB : }
344 : else
5649 tgl 345 LBC 0 : PRSSYNTAXERROR;
5693 teodor 346 ECB : }
5693 teodor 347 CBC 9969 : else if (statecode == WAITPOSDELIM)
348 : {
349 9969 : if (t_iseq(state->prsbuf, ','))
5693 teodor 350 GIC 303 : statecode = INPOSINFO;
5693 teodor 351 CBC 9666 : else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
5693 teodor 352 ECB : {
5693 teodor 353 GIC 210 : if (WEP_GETWEIGHT(pos[npos - 1]))
5649 tgl 354 LBC 0 : PRSSYNTAXERROR;
5693 teodor 355 CBC 210 : WEP_SETWEIGHT(pos[npos - 1], 3);
356 : }
357 9456 : else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
5693 teodor 358 EUB : {
5693 teodor 359 GIC 108 : if (WEP_GETWEIGHT(pos[npos - 1]))
5649 tgl 360 UIC 0 : PRSSYNTAXERROR;
5693 teodor 361 GIC 108 : WEP_SETWEIGHT(pos[npos - 1], 2);
5693 teodor 362 ECB : }
5693 teodor 363 CBC 9348 : else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
364 : {
5693 teodor 365 GIC 138 : if (WEP_GETWEIGHT(pos[npos - 1]))
5649 tgl 366 UBC 0 : PRSSYNTAXERROR;
5693 teodor 367 GIC 138 : WEP_SETWEIGHT(pos[npos - 1], 1);
5693 teodor 368 ECB : }
5693 teodor 369 GIC 9210 : else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
5693 teodor 370 ECB : {
5693 teodor 371 CBC 66 : if (WEP_GETWEIGHT(pos[npos - 1]))
5649 tgl 372 LBC 0 : PRSSYNTAXERROR;
5693 teodor 373 GIC 66 : WEP_SETWEIGHT(pos[npos - 1], 0);
5693 teodor 374 ECB : }
5693 teodor 375 GBC 9144 : else if (t_isspace(state->prsbuf) ||
5693 teodor 376 CBC 4401 : *(state->prsbuf) == '\0')
5693 teodor 377 GIC 4959 : RETURN_TOKEN;
5693 teodor 378 CBC 4185 : else if (!t_isdigit(state->prsbuf))
5649 tgl 379 UIC 0 : PRSSYNTAXERROR;
5693 teodor 380 ECB : }
2118 tgl 381 EUB : else /* internal error */
5611 tgl 382 LBC 0 : elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
383 : statecode);
5693 teodor 384 ECB :
385 : /* get next char */
5693 teodor 386 CBC 296170 : state->prsbuf += pg_mblen(state->prsbuf);
5693 teodor 387 EUB : }
5693 teodor 388 ECB : }
|