TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * parser.c
4 : * Main entry point/driver for PostgreSQL grammar
5 : *
6 : * Note that the grammar is not allowed to perform any table access
7 : * (since we need to be able to do basic parsing even while inside an
8 : * aborted transaction). Therefore, the data structures returned by
9 : * the grammar are "raw" parsetrees that still need to be analyzed by
10 : * analyze.c and related files.
11 : *
12 : *
13 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
14 : * Portions Copyright (c) 1994, Regents of the University of California
15 : *
16 : * IDENTIFICATION
17 : * src/backend/parser/parser.c
18 : *
19 : *-------------------------------------------------------------------------
20 : */
21 :
22 : #include "postgres.h"
23 :
24 : #include "mb/pg_wchar.h"
25 : #include "gramparse.h"
26 : #include "parser/parser.h"
27 : #include "parser/scansup.h"
28 :
29 : static bool check_uescapechar(unsigned char escape);
30 : static char *str_udeescape(const char *str, char escape,
31 : int position, core_yyscan_t yyscanner);
32 :
33 :
34 : /*
35 : * raw_parser
36 : * Given a query in string form, do lexical and grammatical analysis.
37 : *
38 : * Returns a list of raw (un-analyzed) parse trees. The contents of the
39 : * list have the form required by the specified RawParseMode.
40 : */
41 : List *
42 CBC 498144 : raw_parser(const char *str, RawParseMode mode)
43 : {
44 : core_yyscan_t yyscanner;
45 : base_yy_extra_type yyextra;
46 : int yyresult;
47 :
48 : /* initialize the flex scanner */
49 498144 : yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50 : &ScanKeywords, ScanKeywordTokens);
51 :
52 : /* base_yylex() only needs us to initialize the lookahead token, if any */
53 498144 : if (mode == RAW_PARSE_DEFAULT)
54 476010 : yyextra.have_lookahead = false;
55 : else
56 : {
57 : /* this array is indexed by RawParseMode enum */
58 : static const int mode_token[] = {
59 : 0, /* RAW_PARSE_DEFAULT */
60 : MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */
61 : MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */
62 : MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */
63 : MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */
64 : MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */
65 : };
66 :
67 22134 : yyextra.have_lookahead = true;
68 22134 : yyextra.lookahead_token = mode_token[mode];
69 22134 : yyextra.lookahead_yylloc = 0;
70 22134 : yyextra.lookahead_end = NULL;
71 : }
72 :
73 : /* initialize the bison parser */
74 498144 : parser_init(&yyextra);
75 :
76 : /* Parse! */
77 498144 : yyresult = base_yyparse(yyscanner);
78 :
79 : /* Clean up (release memory) */
80 497597 : scanner_finish(yyscanner);
81 :
82 497597 : if (yyresult) /* error */
83 UBC 0 : return NIL;
84 :
85 CBC 497597 : return yyextra.parsetree;
86 : }
87 :
88 :
89 : /*
90 : * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91 : *
92 : * This filter is needed because in some cases the standard SQL grammar
93 : * requires more than one token lookahead. We reduce these cases to one-token
94 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95 : *
96 : * Using a filter is simpler than trying to recognize multiword tokens
97 : * directly in scan.l, because we'd have to allow for comments between the
98 : * words. Furthermore it's not clear how to do that without re-introducing
99 : * scanner backtrack, which would cost more performance than this filter
100 : * layer does.
101 : *
102 : * We also use this filter to convert UIDENT and USCONST sequences into
103 : * plain IDENT and SCONST tokens. While that could be handled by additional
104 : * productions in the main grammar, it's more efficient to do it like this.
105 : *
106 : * The filter also provides a convenient place to translate between
107 : * the core_YYSTYPE and YYSTYPE representations (which are really the
108 : * same thing anyway, but notationally they're different).
109 : */
110 : int
111 16891748 : base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112 : {
113 16891748 : base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114 : int cur_token;
115 : int next_token;
116 : int cur_token_length;
117 : YYLTYPE cur_yylloc;
118 :
119 : /* Get next token --- we might already have it */
120 16891748 : if (yyextra->have_lookahead)
121 : {
122 90571 : cur_token = yyextra->lookahead_token;
123 90571 : lvalp->core_yystype = yyextra->lookahead_yylval;
124 90571 : *llocp = yyextra->lookahead_yylloc;
125 90571 : if (yyextra->lookahead_end)
126 68437 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
127 90571 : yyextra->have_lookahead = false;
128 : }
129 : else
130 16801177 : cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131 :
132 : /*
133 : * If this token isn't one that requires lookahead, just return it. If it
134 : * does, determine the token length. (We could get that via strlen(), but
135 : * since we have such a small set of possibilities, hardwiring seems
136 : * feasible and more efficient --- at least for the fixed-length cases.)
137 : */
138 16891631 : switch (cur_token)
139 : {
140 GNC 1265 : case FORMAT:
141 1265 : cur_token_length = 6;
142 1265 : break;
143 CBC 33598 : case NOT:
144 33598 : cur_token_length = 3;
145 33598 : break;
146 1148 : case NULLS_P:
147 1148 : cur_token_length = 5;
148 1148 : break;
149 31974 : case WITH:
150 31974 : cur_token_length = 4;
151 31974 : break;
152 133 : case UIDENT:
153 ECB : case USCONST:
154 CBC 133 : cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
155 133 : break;
156 GNC 379 : case WITHOUT:
157 379 : cur_token_length = 7;
158 379 : break;
159 GIC 16823134 : default:
160 CBC 16823134 : return cur_token;
161 ECB : }
162 :
163 : /*
164 : * Identify end+1 of current token. core_yylex() has temporarily stored a
165 : * '\0' here, and will undo that when we call it again. We need to redo
166 : * it to fully revert the lookahead call for error reporting purposes.
167 : */
168 GIC 68497 : yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
169 68497 : *llocp + cur_token_length;
170 68497 : Assert(*(yyextra->lookahead_end) == '\0');
171 :
172 : /*
173 : * Save and restore *llocp around the call. It might look like we could
174 ECB : * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175 : * does not work because flex actually holds onto the last-passed pointer
176 : * internally, and will use that for error reporting. We need any error
177 : * reports to point to the current token, not the next one.
178 : */
179 GIC 68497 : cur_yylloc = *llocp;
180 :
181 : /* Get next token, saving outputs into lookahead variables */
182 68497 : next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
183 68497 : yyextra->lookahead_token = next_token;
184 68497 : yyextra->lookahead_yylloc = *llocp;
185 ECB :
186 GIC 68497 : *llocp = cur_yylloc;
187 :
188 ECB : /* Now revert the un-truncation of the current token */
189 CBC 68497 : yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
190 68497 : *(yyextra->lookahead_end) = '\0';
191 :
192 68497 : yyextra->have_lookahead = true;
193 :
194 : /* Replace cur_token if needed, based on lookahead */
195 68497 : switch (cur_token)
196 ECB : {
197 GNC 1265 : case FORMAT:
198 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
199 : switch (next_token)
200 : {
201 187 : case JSON:
202 187 : cur_token = FORMAT_LA;
203 187 : break;
204 : }
205 1265 : break;
206 :
207 GIC 33598 : case NOT:
208 ECB : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
209 : switch (next_token)
210 : {
211 CBC 2985 : case BETWEEN:
212 : case IN_P:
213 ECB : case LIKE:
214 : case ILIKE:
215 : case SIMILAR:
216 GIC 2985 : cur_token = NOT_LA;
217 CBC 2985 : break;
218 ECB : }
219 CBC 33598 : break;
220 :
221 1148 : case NULLS_P:
222 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
223 ECB : switch (next_token)
224 : {
225 GIC 794 : case FIRST_P:
226 : case LAST_P:
227 CBC 794 : cur_token = NULLS_LA;
228 GIC 794 : break;
229 : }
230 1148 : break;
231 :
232 CBC 31974 : case WITH:
233 ECB : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
234 : switch (next_token)
235 : {
236 GIC 1647 : case TIME:
237 ECB : case ORDINALITY:
238 GIC 1647 : cur_token = WITH_LA;
239 1647 : break;
240 : }
241 CBC 31974 : break;
242 :
243 GNC 379 : case WITHOUT:
244 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
245 : switch (next_token)
246 : {
247 250 : case TIME:
248 250 : cur_token = WITHOUT_LA;
249 250 : break;
250 : }
251 379 : break;
252 :
253 CBC 133 : case UIDENT:
254 ECB : case USCONST:
255 : /* Look ahead for UESCAPE */
256 CBC 133 : if (next_token == UESCAPE)
257 : {
258 ECB : /* Yup, so get third token, which had better be SCONST */
259 : const char *escstr;
260 :
261 : /* Again save and restore *llocp */
262 CBC 22 : cur_yylloc = *llocp;
263 :
264 ECB : /* Un-truncate current token so errors point to third token */
265 CBC 22 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
266 :
267 ECB : /* Get third token */
268 GIC 22 : next_token = core_yylex(&(yyextra->lookahead_yylval),
269 ECB : llocp, yyscanner);
270 :
271 : /* If we throw error here, it will point to third token */
272 GIC 22 : if (next_token != SCONST)
273 CBC 3 : scanner_yyerror("UESCAPE must be followed by a simple string literal",
274 ECB : yyscanner);
275 :
276 GIC 19 : escstr = yyextra->lookahead_yylval.str;
277 CBC 19 : if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
278 GIC 3 : scanner_yyerror("invalid Unicode escape character",
279 ECB : yyscanner);
280 :
281 : /* Now restore *llocp; errors will point to first token */
282 CBC 16 : *llocp = cur_yylloc;
283 :
284 : /* Apply Unicode conversion */
285 GIC 16 : lvalp->core_yystype.str =
286 16 : str_udeescape(lvalp->core_yystype.str,
287 16 : escstr[0],
288 ECB : *llocp,
289 : yyscanner);
290 :
291 : /*
292 : * We don't need to revert the un-truncation of UESCAPE. What
293 : * we do want to do is clear have_lookahead, thereby consuming
294 : * all three tokens.
295 : */
296 GIC 16 : yyextra->have_lookahead = false;
297 : }
298 ECB : else
299 : {
300 : /* No UESCAPE, so convert using default escape character */
301 GIC 87 : lvalp->core_yystype.str =
302 CBC 111 : str_udeescape(lvalp->core_yystype.str,
303 ECB : '\\',
304 : *llocp,
305 : yyscanner);
306 : }
307 :
308 CBC 103 : if (cur_token == UIDENT)
309 : {
310 : /* It's an identifier, so truncate as appropriate */
311 13 : truncate_identifier(lvalp->core_yystype.str,
312 13 : strlen(lvalp->core_yystype.str),
313 ECB : true);
314 GIC 13 : cur_token = IDENT;
315 : }
316 90 : else if (cur_token == USCONST)
317 : {
318 90 : cur_token = SCONST;
319 : }
320 103 : break;
321 : }
322 ECB :
323 GIC 68467 : return cur_token;
324 : }
325 :
326 : /* convert hex digit (caller should have verified that) to value */
327 ECB : static unsigned int
328 CBC 794 : hexval(unsigned char c)
329 : {
330 GIC 794 : if (c >= '0' && c <= '9')
331 669 : return c - '0';
332 125 : if (c >= 'a' && c <= 'f')
333 30 : return c - 'a' + 0xA;
334 CBC 95 : if (c >= 'A' && c <= 'F')
335 GIC 95 : return c - 'A' + 0xA;
336 UIC 0 : elog(ERROR, "invalid hexadecimal digit");
337 ECB : return 0; /* not reached */
338 : }
339 :
340 : /* is Unicode code point acceptable? */
341 : static void
342 CBC 190 : check_unicode_value(pg_wchar c)
343 : {
344 190 : if (!is_valid_unicode_codepoint(c))
345 GIC 3 : ereport(ERROR,
346 ECB : (errcode(ERRCODE_SYNTAX_ERROR),
347 : errmsg("invalid Unicode escape value")));
348 GIC 187 : }
349 ECB :
/*
 * Decide whether "escape" may serve as the Unicode escape character in
 * UESCAPE syntax.
 *
 * Hex digits, '+', single and double quotes, and anything
 * scanner_isspace() accepts are rejected; every other character is allowed.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;

	if (escape == '+' || escape == '\'' || escape == '"')
		return false;

	return !scanner_isspace(escape);
}
363 :
364 : /*
365 : * Process Unicode escapes in "str", producing a palloc'd plain string
366 : *
367 : * escape: the escape character to use
368 ECB : * position: start position of U&'' or U&"" string token
369 : * yyscanner: context information needed for error reports
370 : */
371 : static char *
372 GIC 127 : str_udeescape(const char *str, char escape,
373 : int position, core_yyscan_t yyscanner)
374 ECB : {
375 : const char *in;
376 : char *new,
377 : *out;
378 : size_t new_len;
379 GIC 127 : pg_wchar pair_first = 0;
380 ECB : ScannerCallbackState scbstate;
381 :
382 : /*
383 : * Guesstimate that result will be no longer than input, but allow enough
384 : * padding for Unicode conversion.
385 : */
386 GIC 127 : new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
387 CBC 127 : new = palloc(new_len);
388 :
389 GIC 127 : in = str;
390 127 : out = new;
391 696 : while (*in)
392 : {
393 : /* Enlarge string if needed */
394 590 : size_t out_dist = out - new;
395 :
396 590 : if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
397 : {
398 LBC 0 : new_len *= 2;
399 UIC 0 : new = repalloc(new, new_len);
400 0 : out = new + out_dist;
401 : }
402 :
403 GIC 590 : if (in[0] == escape)
404 : {
405 ECB : /*
406 : * Any errors reported while processing this escape sequence will
407 : * have an error cursor pointing at the escape.
408 : */
409 GIC 202 : setup_scanner_errposition_callback(&scbstate, yyscanner,
410 202 : in - str + position + 3); /* 3 for U&" */
411 202 : if (in[1] == escape)
412 ECB : {
413 CBC 6 : if (pair_first)
414 GIC 3 : goto invalid_pair;
415 CBC 3 : *out++ = escape;
416 3 : in += 2;
417 ECB : }
418 GIC 196 : else if (isxdigit((unsigned char) in[1]) &&
419 176 : isxdigit((unsigned char) in[2]) &&
420 CBC 176 : isxdigit((unsigned char) in[3]) &&
421 GIC 176 : isxdigit((unsigned char) in[4]))
422 CBC 170 : {
423 : pg_wchar unicode;
424 EUB :
425 GBC 173 : unicode = (hexval(in[1]) << 12) +
426 173 : (hexval(in[2]) << 8) +
427 GIC 173 : (hexval(in[3]) << 4) +
428 173 : hexval(in[4]);
429 CBC 173 : check_unicode_value(unicode);
430 GIC 173 : if (pair_first)
431 : {
432 3 : if (is_utf16_surrogate_second(unicode))
433 : {
434 UIC 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
435 LBC 0 : pair_first = 0;
436 ECB : }
437 : else
438 GIC 3 : goto invalid_pair;
439 ECB : }
440 CBC 170 : else if (is_utf16_surrogate_second(unicode))
441 LBC 0 : goto invalid_pair;
442 ECB :
443 GIC 170 : if (is_utf16_surrogate_first(unicode))
444 CBC 12 : pair_first = unicode;
445 ECB : else
446 : {
447 CBC 158 : pg_unicode_to_server(unicode, (unsigned char *) out);
448 158 : out += strlen(out);
449 : }
450 GIC 170 : in += 5;
451 ECB : }
452 CBC 23 : else if (in[1] == '+' &&
453 20 : isxdigit((unsigned char) in[2]) &&
454 20 : isxdigit((unsigned char) in[3]) &&
455 20 : isxdigit((unsigned char) in[4]) &&
456 20 : isxdigit((unsigned char) in[5]) &&
457 GIC 20 : isxdigit((unsigned char) in[6]) &&
458 CBC 17 : isxdigit((unsigned char) in[7]))
459 GIC 11 : {
460 EUB : pg_wchar unicode;
461 :
462 GIC 17 : unicode = (hexval(in[2]) << 20) +
463 17 : (hexval(in[3]) << 16) +
464 CBC 17 : (hexval(in[4]) << 12) +
465 GIC 17 : (hexval(in[5]) << 8) +
466 CBC 17 : (hexval(in[6]) << 4) +
467 GBC 17 : hexval(in[7]);
468 GIC 17 : check_unicode_value(unicode);
469 CBC 14 : if (pair_first)
470 ECB : {
471 GIC 3 : if (is_utf16_surrogate_second(unicode))
472 : {
473 LBC 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
474 0 : pair_first = 0;
475 : }
476 ECB : else
477 GIC 3 : goto invalid_pair;
478 ECB : }
479 CBC 11 : else if (is_utf16_surrogate_second(unicode))
480 LBC 0 : goto invalid_pair;
481 ECB :
482 CBC 11 : if (is_utf16_surrogate_first(unicode))
483 3 : pair_first = unicode;
484 ECB : else
485 : {
486 GIC 8 : pg_unicode_to_server(unicode, (unsigned char *) out);
487 8 : out += strlen(out);
488 ECB : }
489 CBC 11 : in += 8;
490 ECB : }
491 : else
492 CBC 6 : ereport(ERROR,
493 ECB : (errcode(ERRCODE_SYNTAX_ERROR),
494 : errmsg("invalid Unicode escape"),
495 : errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
496 :
497 CBC 184 : cancel_scanner_errposition_callback(&scbstate);
498 : }
499 EUB : else
500 : {
501 GIC 388 : if (pair_first)
502 3 : goto invalid_pair;
503 ECB :
504 GIC 385 : *out++ = *in++;
505 ECB : }
506 EUB : }
507 :
508 ECB : /* unfinished surrogate pair? */
509 CBC 106 : if (pair_first)
510 GIC 3 : goto invalid_pair;
511 :
512 CBC 103 : *out = '\0';
513 103 : return new;
514 :
515 ECB : /*
516 : * We might get here with the error callback active, or not. Call
517 : * scanner_errposition to make sure an error cursor appears; if the
518 : * callback is active, this is duplicative but harmless.
519 : */
520 GIC 15 : invalid_pair:
521 15 : ereport(ERROR,
522 : (errcode(ERRCODE_SYNTAX_ERROR),
523 ECB : errmsg("invalid Unicode surrogate pair"),
524 : scanner_errposition(in - str + position + 3, /* 3 for U&" */
525 : yyscanner)));
526 : return NULL; /* keep compiler quiet */
527 : }
|