Age Owner TLA Line data Source code
/*-------------------------------------------------------------------------
 *
 * parser.c
 *		Main entry point/driver for PostgreSQL grammar
 *
 * Note that the grammar is not allowed to perform any table access
 * (since we need to be able to do basic parsing even while inside an
 * aborted transaction).  Therefore, the data structures returned by
 * the grammar are "raw" parsetrees that still need to be analyzed by
 * analyze.c and related files.
 *
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/parser/parser.c
 *
 *-------------------------------------------------------------------------
 */
21 :
22 : #include "postgres.h"
23 :
24 : #include "mb/pg_wchar.h"
25 : #include "gramparse.h"
26 : #include "parser/parser.h"
27 : #include "parser/scansup.h"
28 :
29 : static bool check_uescapechar(unsigned char escape);
30 : static char *str_udeescape(const char *str, char escape,
31 : int position, core_yyscan_t yyscanner);
32 :
33 :
34 : /*
35 : * raw_parser
36 : * Given a query in string form, do lexical and grammatical analysis.
37 : *
38 : * Returns a list of raw (un-analyzed) parse trees. The contents of the
39 : * list have the form required by the specified RawParseMode.
40 : */
41 : List *
825 tgl 42 CBC 498144 : raw_parser(const char *str, RawParseMode mode)
43 : {
44 : core_yyscan_t yyscanner;
45 : base_yy_extra_type yyextra;
46 : int yyresult;
47 :
48 : /* initialize the flex scanner */
4899 49 498144 : yyscanner = scanner_init(str, &yyextra.core_yy_extra,
50 : &ScanKeywords, ScanKeywordTokens);
51 :
52 : /* base_yylex() only needs us to initialize the lookahead token, if any */
825 53 498144 : if (mode == RAW_PARSE_DEFAULT)
54 476010 : yyextra.have_lookahead = false;
55 : else
56 : {
57 : /* this array is indexed by RawParseMode enum */
58 : static const int mode_token[] = {
59 : 0, /* RAW_PARSE_DEFAULT */
60 : MODE_TYPE_NAME, /* RAW_PARSE_TYPE_NAME */
61 : MODE_PLPGSQL_EXPR, /* RAW_PARSE_PLPGSQL_EXPR */
62 : MODE_PLPGSQL_ASSIGN1, /* RAW_PARSE_PLPGSQL_ASSIGN1 */
63 : MODE_PLPGSQL_ASSIGN2, /* RAW_PARSE_PLPGSQL_ASSIGN2 */
64 : MODE_PLPGSQL_ASSIGN3 /* RAW_PARSE_PLPGSQL_ASSIGN3 */
65 : };
66 :
67 22134 : yyextra.have_lookahead = true;
68 22134 : yyextra.lookahead_token = mode_token[mode];
69 22134 : yyextra.lookahead_yylloc = 0;
70 22134 : yyextra.lookahead_end = NULL;
71 : }
72 :
73 : /* initialize the bison parser */
5018 74 498144 : parser_init(&yyextra);
75 :
76 : /* Parse! */
77 498144 : yyresult = base_yyparse(yyscanner);
78 :
79 : /* Clean up (release memory) */
80 497597 : scanner_finish(yyscanner);
81 :
9173 bruce 82 497597 : if (yyresult) /* error */
8219 tgl 83 UBC 0 : return NIL;
84 :
5018 tgl 85 CBC 497597 : return yyextra.parsetree;
86 : }
87 :
88 :
89 : /*
90 : * Intermediate filter between parser and core lexer (core_yylex in scan.l).
91 : *
92 : * This filter is needed because in some cases the standard SQL grammar
93 : * requires more than one token lookahead. We reduce these cases to one-token
94 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
95 : *
96 : * Using a filter is simpler than trying to recognize multiword tokens
97 : * directly in scan.l, because we'd have to allow for comments between the
98 : * words. Furthermore it's not clear how to do that without re-introducing
99 : * scanner backtrack, which would cost more performance than this filter
100 : * layer does.
101 : *
102 : * We also use this filter to convert UIDENT and USCONST sequences into
103 : * plain IDENT and SCONST tokens. While that could be handled by additional
104 : * productions in the main grammar, it's more efficient to do it like this.
105 : *
106 : * The filter also provides a convenient place to translate between
107 : * the core_YYSTYPE and YYSTYPE representations (which are really the
108 : * same thing anyway, but notationally they're different).
109 : */
110 : int
4899 111 16891748 : base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp, core_yyscan_t yyscanner)
112 : {
5018 113 16891748 : base_yy_extra_type *yyextra = pg_yyget_extra(yyscanner);
114 : int cur_token;
115 : int next_token;
116 : int cur_token_length;
117 : YYLTYPE cur_yylloc;
118 :
119 : /* Get next token --- we might already have it */
120 16891748 : if (yyextra->have_lookahead)
121 : {
122 90571 : cur_token = yyextra->lookahead_token;
4899 123 90571 : lvalp->core_yystype = yyextra->lookahead_yylval;
5018 124 90571 : *llocp = yyextra->lookahead_yylloc;
825 125 90571 : if (yyextra->lookahead_end)
126 68437 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
5018 127 90571 : yyextra->have_lookahead = false;
128 : }
129 : else
4899 130 16801177 : cur_token = core_yylex(&(lvalp->core_yystype), llocp, yyscanner);
131 :
132 : /*
133 : * If this token isn't one that requires lookahead, just return it. If it
134 : * does, determine the token length. (We could get that via strlen(), but
135 : * since we have such a small set of possibilities, hardwiring seems
136 : * feasible and more efficient --- at least for the fixed-length cases.)
137 : */
6161 138 16891631 : switch (cur_token)
139 : {
11 alvherre 140 GNC 1265 : case FORMAT:
141 1265 : cur_token_length = 6;
142 1265 : break;
2951 tgl 143 CBC 33598 : case NOT:
144 33598 : cur_token_length = 3;
145 33598 : break;
5934 146 1148 : case NULLS_P:
2966 147 1148 : cur_token_length = 5;
148 1148 : break;
149 31974 : case WITH:
150 31974 : cur_token_length = 4;
151 31974 : break;
1182 152 133 : case UIDENT:
1182 tgl 153 ECB : case USCONST:
1182 tgl 154 CBC 133 : cur_token_length = strlen(yyextra->core_yy_extra.scanbuf + *llocp);
155 133 : break;
11 alvherre 156 GNC 379 : case WITHOUT:
157 379 : cur_token_length = 7;
158 379 : break;
2966 tgl 159 GIC 16823134 : default:
2966 tgl 160 CBC 16823134 : return cur_token;
2966 tgl 161 ECB : }
5624 bruce 162 :
2966 tgl 163 : /*
164 : * Identify end+1 of current token. core_yylex() has temporarily stored a
165 : * '\0' here, and will undo that when we call it again. We need to redo
166 : * it to fully revert the lookahead call for error reporting purposes.
167 : */
2966 tgl 168 GIC 68497 : yyextra->lookahead_end = yyextra->core_yy_extra.scanbuf +
169 68497 : *llocp + cur_token_length;
170 68497 : Assert(*(yyextra->lookahead_end) == '\0');
171 :
172 : /*
173 : * Save and restore *llocp around the call. It might look like we could
2966 tgl 174 ECB : * avoid this by just passing &lookahead_yylloc to core_yylex(), but that
175 : * does not work because flex actually holds onto the last-passed pointer
176 : * internally, and will use that for error reporting. We need any error
177 : * reports to point to the current token, not the next one.
178 : */
2966 tgl 179 GIC 68497 : cur_yylloc = *llocp;
180 :
181 : /* Get next token, saving outputs into lookahead variables */
182 68497 : next_token = core_yylex(&(yyextra->lookahead_yylval), llocp, yyscanner);
183 68497 : yyextra->lookahead_token = next_token;
184 68497 : yyextra->lookahead_yylloc = *llocp;
2966 tgl 185 ECB :
2966 tgl 186 GIC 68497 : *llocp = cur_yylloc;
187 :
2966 tgl 188 ECB : /* Now revert the un-truncation of the current token */
2966 tgl 189 CBC 68497 : yyextra->lookahead_hold_char = *(yyextra->lookahead_end);
190 68497 : *(yyextra->lookahead_end) = '\0';
191 :
192 68497 : yyextra->have_lookahead = true;
193 :
194 : /* Replace cur_token if needed, based on lookahead */
195 68497 : switch (cur_token)
2966 tgl 196 ECB : {
11 alvherre 197 GNC 1265 : case FORMAT:
198 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
199 : switch (next_token)
200 : {
201 187 : case JSON:
202 187 : cur_token = FORMAT_LA;
203 187 : break;
204 : }
205 1265 : break;
206 :
2951 tgl 207 GIC 33598 : case NOT:
2951 tgl 208 ECB : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
209 : switch (next_token)
210 : {
2951 tgl 211 CBC 2985 : case BETWEEN:
212 : case IN_P:
2951 tgl 213 ECB : case LIKE:
214 : case ILIKE:
215 : case SIMILAR:
2951 tgl 216 GIC 2985 : cur_token = NOT_LA;
2951 tgl 217 CBC 2985 : break;
2951 tgl 218 ECB : }
2951 tgl 219 CBC 33598 : break;
220 :
2966 221 1148 : case NULLS_P:
222 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
5934 tgl 223 ECB : switch (next_token)
224 : {
5934 tgl 225 GIC 794 : case FIRST_P:
226 : case LAST_P:
2966 tgl 227 CBC 794 : cur_token = NULLS_LA;
5934 tgl 228 GIC 794 : break;
229 : }
230 1148 : break;
231 :
5934 tgl 232 CBC 31974 : case WITH:
2966 tgl 233 ECB : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
234 : switch (next_token)
6161 235 : {
5276 peter_e 236 GIC 1647 : case TIME:
3541 stark 237 ECB : case ORDINALITY:
2966 tgl 238 GIC 1647 : cur_token = WITH_LA;
6161 239 1647 : break;
240 : }
6161 tgl 241 CBC 31974 : break;
242 :
11 alvherre 243 GNC 379 : case WITHOUT:
244 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
245 : switch (next_token)
246 : {
5 247 250 : case TIME:
11 248 250 : cur_token = WITHOUT_LA;
249 250 : break;
250 : }
251 379 : break;
252 :
1182 tgl 253 CBC 133 : case UIDENT:
1182 tgl 254 ECB : case USCONST:
255 : /* Look ahead for UESCAPE */
1182 tgl 256 CBC 133 : if (next_token == UESCAPE)
257 : {
1182 tgl 258 ECB : /* Yup, so get third token, which had better be SCONST */
259 : const char *escstr;
260 :
261 : /* Again save and restore *llocp */
1182 tgl 262 CBC 22 : cur_yylloc = *llocp;
263 :
1182 tgl 264 ECB : /* Un-truncate current token so errors point to third token */
1182 tgl 265 CBC 22 : *(yyextra->lookahead_end) = yyextra->lookahead_hold_char;
266 :
1182 tgl 267 ECB : /* Get third token */
1182 tgl 268 GIC 22 : next_token = core_yylex(&(yyextra->lookahead_yylval),
1182 tgl 269 ECB : llocp, yyscanner);
270 :
271 : /* If we throw error here, it will point to third token */
1182 tgl 272 GIC 22 : if (next_token != SCONST)
1182 tgl 273 CBC 3 : scanner_yyerror("UESCAPE must be followed by a simple string literal",
1182 tgl 274 ECB : yyscanner);
275 :
1182 tgl 276 GIC 19 : escstr = yyextra->lookahead_yylval.str;
1182 tgl 277 CBC 19 : if (strlen(escstr) != 1 || !check_uescapechar(escstr[0]))
1182 tgl 278 GIC 3 : scanner_yyerror("invalid Unicode escape character",
1182 tgl 279 ECB : yyscanner);
280 :
281 : /* Now restore *llocp; errors will point to first token */
1182 tgl 282 CBC 16 : *llocp = cur_yylloc;
283 :
284 : /* Apply Unicode conversion */
1182 tgl 285 GIC 16 : lvalp->core_yystype.str =
286 16 : str_udeescape(lvalp->core_yystype.str,
287 16 : escstr[0],
1182 tgl 288 ECB : *llocp,
289 : yyscanner);
290 :
291 : /*
292 : * We don't need to revert the un-truncation of UESCAPE. What
293 : * we do want to do is clear have_lookahead, thereby consuming
294 : * all three tokens.
295 : */
1182 tgl 296 GIC 16 : yyextra->have_lookahead = false;
297 : }
1182 tgl 298 ECB : else
299 : {
300 : /* No UESCAPE, so convert using default escape character */
1182 tgl 301 GIC 87 : lvalp->core_yystype.str =
1182 tgl 302 CBC 111 : str_udeescape(lvalp->core_yystype.str,
1182 tgl 303 ECB : '\\',
304 : *llocp,
305 : yyscanner);
306 : }
307 :
1182 tgl 308 CBC 103 : if (cur_token == UIDENT)
309 : {
310 : /* It's an identifier, so truncate as appropriate */
311 13 : truncate_identifier(lvalp->core_yystype.str,
312 13 : strlen(lvalp->core_yystype.str),
1182 tgl 313 ECB : true);
1182 tgl 314 GIC 13 : cur_token = IDENT;
315 : }
316 90 : else if (cur_token == USCONST)
317 : {
318 90 : cur_token = SCONST;
319 : }
320 103 : break;
321 : }
6161 tgl 322 ECB :
6161 tgl 323 GIC 68467 : return cur_token;
324 : }
325 :
326 : /* convert hex digit (caller should have verified that) to value */
1182 tgl 327 ECB : static unsigned int
1182 tgl 328 CBC 794 : hexval(unsigned char c)
329 : {
1182 tgl 330 GIC 794 : if (c >= '0' && c <= '9')
331 669 : return c - '0';
332 125 : if (c >= 'a' && c <= 'f')
333 30 : return c - 'a' + 0xA;
1182 tgl 334 CBC 95 : if (c >= 'A' && c <= 'F')
1182 tgl 335 GIC 95 : return c - 'A' + 0xA;
1182 tgl 336 UIC 0 : elog(ERROR, "invalid hexadecimal digit");
1182 tgl 337 ECB : return 0; /* not reached */
338 : }
339 :
1129 340 : /* is Unicode code point acceptable? */
341 : static void
1129 tgl 342 CBC 190 : check_unicode_value(pg_wchar c)
343 : {
344 190 : if (!is_valid_unicode_codepoint(c))
1182 tgl 345 GIC 3 : ereport(ERROR,
1182 tgl 346 ECB : (errcode(ERRCODE_SYNTAX_ERROR),
347 : errmsg("invalid Unicode escape value")));
1182 tgl 348 GIC 187 : }
1182 tgl 349 ECB :
/*
 * check_uescapechar
 *		Is 'escape' acceptable as the Unicode escape character given in a
 *		UESCAPE clause?
 *
 * Returns false for hexadecimal digits, '+', single quote, double quote,
 * and anything scanner_isspace() treats as whitespace; true otherwise.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape))
		return false;
	if (escape == '+')
		return false;
	if (escape == '\'')
		return false;
	if (escape == '"')
		return false;
	if (scanner_isspace(escape))
		return false;

	return true;
}
363 :
364 : /*
365 : * Process Unicode escapes in "str", producing a palloc'd plain string
366 : *
367 : * escape: the escape character to use
1182 tgl 368 ECB : * position: start position of U&'' or U&"" string token
369 : * yyscanner: context information needed for error reports
370 : */
371 : static char *
1182 tgl 372 GIC 127 : str_udeescape(const char *str, char escape,
373 : int position, core_yyscan_t yyscanner)
1182 tgl 374 ECB : {
375 : const char *in;
376 : char *new,
377 : *out;
1129 378 : size_t new_len;
1182 tgl 379 GIC 127 : pg_wchar pair_first = 0;
1129 tgl 380 ECB : ScannerCallbackState scbstate;
1182 381 :
382 : /*
1129 383 : * Guesstimate that result will be no longer than input, but allow enough
384 : * padding for Unicode conversion.
1182 385 : */
1129 tgl 386 GIC 127 : new_len = strlen(str) + MAX_UNICODE_EQUIVALENT_STRING + 1;
1129 tgl 387 CBC 127 : new = palloc(new_len);
388 :
1182 tgl 389 GIC 127 : in = str;
390 127 : out = new;
391 696 : while (*in)
392 : {
393 : /* Enlarge string if needed */
1129 394 590 : size_t out_dist = out - new;
395 :
396 590 : if (out_dist > new_len - (MAX_UNICODE_EQUIVALENT_STRING + 1))
397 : {
1129 tgl 398 LBC 0 : new_len *= 2;
1129 tgl 399 UIC 0 : new = repalloc(new, new_len);
400 0 : out = new + out_dist;
401 : }
402 :
1182 tgl 403 GIC 590 : if (in[0] == escape)
404 : {
1129 tgl 405 ECB : /*
406 : * Any errors reported while processing this escape sequence will
407 : * have an error cursor pointing at the escape.
408 : */
1129 tgl 409 GIC 202 : setup_scanner_errposition_callback(&scbstate, yyscanner,
410 202 : in - str + position + 3); /* 3 for U&" */
1182 411 202 : if (in[1] == escape)
1182 tgl 412 ECB : {
1182 tgl 413 CBC 6 : if (pair_first)
1182 tgl 414 GIC 3 : goto invalid_pair;
1182 tgl 415 CBC 3 : *out++ = escape;
416 3 : in += 2;
1182 tgl 417 ECB : }
1182 tgl 418 GIC 196 : else if (isxdigit((unsigned char) in[1]) &&
419 176 : isxdigit((unsigned char) in[2]) &&
1182 tgl 420 CBC 176 : isxdigit((unsigned char) in[3]) &&
1182 tgl 421 GIC 176 : isxdigit((unsigned char) in[4]))
1182 tgl 422 CBC 170 : {
423 : pg_wchar unicode;
1182 tgl 424 EUB :
1182 tgl 425 GBC 173 : unicode = (hexval(in[1]) << 12) +
426 173 : (hexval(in[2]) << 8) +
1182 tgl 427 GIC 173 : (hexval(in[3]) << 4) +
428 173 : hexval(in[4]);
1129 tgl 429 CBC 173 : check_unicode_value(unicode);
1182 tgl 430 GIC 173 : if (pair_first)
431 : {
432 3 : if (is_utf16_surrogate_second(unicode))
433 : {
1182 tgl 434 UIC 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1182 tgl 435 LBC 0 : pair_first = 0;
1182 tgl 436 ECB : }
437 : else
1182 tgl 438 GIC 3 : goto invalid_pair;
1182 tgl 439 ECB : }
1182 tgl 440 CBC 170 : else if (is_utf16_surrogate_second(unicode))
1182 tgl 441 LBC 0 : goto invalid_pair;
1182 tgl 442 ECB :
1182 tgl 443 GIC 170 : if (is_utf16_surrogate_first(unicode))
1182 tgl 444 CBC 12 : pair_first = unicode;
1182 tgl 445 ECB : else
446 : {
1129 tgl 447 CBC 158 : pg_unicode_to_server(unicode, (unsigned char *) out);
448 158 : out += strlen(out);
449 : }
1182 tgl 450 GIC 170 : in += 5;
1182 tgl 451 ECB : }
1182 tgl 452 CBC 23 : else if (in[1] == '+' &&
453 20 : isxdigit((unsigned char) in[2]) &&
454 20 : isxdigit((unsigned char) in[3]) &&
455 20 : isxdigit((unsigned char) in[4]) &&
456 20 : isxdigit((unsigned char) in[5]) &&
1182 tgl 457 GIC 20 : isxdigit((unsigned char) in[6]) &&
1182 tgl 458 CBC 17 : isxdigit((unsigned char) in[7]))
1182 tgl 459 GIC 11 : {
1182 tgl 460 EUB : pg_wchar unicode;
461 :
1182 tgl 462 GIC 17 : unicode = (hexval(in[2]) << 20) +
463 17 : (hexval(in[3]) << 16) +
1182 tgl 464 CBC 17 : (hexval(in[4]) << 12) +
1182 tgl 465 GIC 17 : (hexval(in[5]) << 8) +
1182 tgl 466 CBC 17 : (hexval(in[6]) << 4) +
1182 tgl 467 GBC 17 : hexval(in[7]);
1129 tgl 468 GIC 17 : check_unicode_value(unicode);
1182 tgl 469 CBC 14 : if (pair_first)
1182 tgl 470 ECB : {
1182 tgl 471 GIC 3 : if (is_utf16_surrogate_second(unicode))
472 : {
1182 tgl 473 LBC 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
474 0 : pair_first = 0;
475 : }
1182 tgl 476 ECB : else
1182 tgl 477 GIC 3 : goto invalid_pair;
1182 tgl 478 ECB : }
1182 tgl 479 CBC 11 : else if (is_utf16_surrogate_second(unicode))
1182 tgl 480 LBC 0 : goto invalid_pair;
1182 tgl 481 ECB :
1182 tgl 482 CBC 11 : if (is_utf16_surrogate_first(unicode))
483 3 : pair_first = unicode;
1182 tgl 484 ECB : else
485 : {
1129 tgl 486 GIC 8 : pg_unicode_to_server(unicode, (unsigned char *) out);
487 8 : out += strlen(out);
1182 tgl 488 ECB : }
1182 tgl 489 CBC 11 : in += 8;
1182 tgl 490 ECB : }
491 : else
1182 tgl 492 CBC 6 : ereport(ERROR,
1182 tgl 493 ECB : (errcode(ERRCODE_SYNTAX_ERROR),
1129 494 : errmsg("invalid Unicode escape"),
495 : errhint("Unicode escapes must be \\XXXX or \\+XXXXXX.")));
496 :
1129 tgl 497 CBC 184 : cancel_scanner_errposition_callback(&scbstate);
498 : }
1182 tgl 499 EUB : else
500 : {
1182 tgl 501 GIC 388 : if (pair_first)
502 3 : goto invalid_pair;
1182 tgl 503 ECB :
1182 tgl 504 GIC 385 : *out++ = *in++;
1182 tgl 505 ECB : }
1182 tgl 506 EUB : }
507 :
1182 tgl 508 ECB : /* unfinished surrogate pair? */
1182 tgl 509 CBC 106 : if (pair_first)
1182 tgl 510 GIC 3 : goto invalid_pair;
511 :
1182 tgl 512 CBC 103 : *out = '\0';
1129 513 103 : return new;
514 :
1182 tgl 515 ECB : /*
516 : * We might get here with the error callback active, or not. Call
517 : * scanner_errposition to make sure an error cursor appears; if the
1129 518 : * callback is active, this is duplicative but harmless.
519 : */
1182 tgl 520 GIC 15 : invalid_pair:
521 15 : ereport(ERROR,
522 : (errcode(ERRCODE_SYNTAX_ERROR),
1182 tgl 523 ECB : errmsg("invalid Unicode surrogate pair"),
524 : scanner_errposition(in - str + position + 3, /* 3 for U&" */
525 : yyscanner)));
526 : return NULL; /* keep compiler quiet */
527 : }
|