Age Owner TLA Line data Source code
1 : %top{
2 : /*-------------------------------------------------------------------------
3 : *
4 : * scan.l
5 : * lexical scanner for PostgreSQL
6 : *
7 : * NOTE NOTE NOTE:
8 : *
9 : * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l
10 : * and src/interfaces/ecpg/preproc/pgc.l!
11 : *
12 : * The rules are designed so that the scanner never has to backtrack,
13 : * in the sense that there is always a rule that can match the input
14 : * consumed so far (the rule action may internally throw back some input
15 : * with yyless(), however). As explained in the flex manual, this makes
16 : * for a useful speed increase --- several percent faster when measuring
17 : * raw parsing (Flex + Bison). The extra complexity is mostly in the rules
18 : * for handling float numbers and continued string literals. If you change
19 : * the lexical rules, verify that you haven't broken the no-backtrack
20 : * property by running flex with the "-b" option and checking that the
21 : * resulting "lex.backup" file says that no backing up is needed. (As of
22 : * Postgres 9.2, this check is made automatically by the Makefile.)
23 : *
24 : *
25 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
26 : * Portions Copyright (c) 1994, Regents of the University of California
27 : *
28 : * IDENTIFICATION
29 : * src/backend/parser/scan.l
30 : *
31 : *-------------------------------------------------------------------------
32 : */
33 : #include "postgres.h"
34 :
35 : #include <ctype.h>
36 : #include <unistd.h>
37 :
38 : #include "common/string.h"
39 : #include "gramparse.h"
40 : #include "nodes/miscnodes.h"
41 : #include "parser/parser.h" /* only needed for GUC variables */
42 : #include "parser/scansup.h"
43 : #include "port/pg_bitutils.h"
44 : #include "mb/pg_wchar.h"
45 : #include "utils/builtins.h"
46 : }
47 :
48 : %{
49 :
50 : /* LCOV_EXCL_START */
51 :
52 : /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error): fprintf(file, fmt, msg) calls after this point expand to fprintf_to_ereport(fmt, msg) */
53 : #undef fprintf
54 : #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
55 : /* Report msg through ereport(ERROR); fmt is accepted but never used. */
56 : static void
3738 tgl 57 UIC 0 : fprintf_to_ereport(const char *fmt, const char *msg)
58 : {
3738 tgl 59 UBC 0 : ereport(ERROR, (errmsg_internal("%s", msg)));
60 : }
8100 peter_e 61 EUB :
62 : /*
63 : * GUC variables. This is a DIRECT violation of the warning given at the
64 : * head of gram.y, i.e., flex/bison code must not depend on any GUC variables;
65 : * as such, changing their values can induce very unintuitive behavior.
66 : * But we shall have to live with it until we can remove these variables.
67 : */
68 : int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
69 : bool escape_string_warning = true;
70 : bool standard_conforming_strings = true;
71 :
72 : /*
73 : * Constant data exported from this file. This array maps from the
74 : * zero-based keyword numbers returned by ScanKeywordLookup to the
75 : * Bison token numbers needed by gram.y. This is exported because
76 : * callers need to pass it to scanner_init, if they are using the
77 : * standard keyword list ScanKeywords.
78 : */
79 : #define PG_KEYWORD(kwname, value, category, collabel) value,
80 :
81 : const uint16 ScanKeywordTokens[] = {
82 : #include "parser/kwlist.h"
83 : };
84 :
85 : #undef PG_KEYWORD
86 :
87 : /*
88 : * Set the type of YYSTYPE.
89 : */
90 : #define YYSTYPE core_YYSTYPE
91 :
92 : /*
93 : * Set the type of yyextra. All state variables used by the scanner should
94 : * be in yyextra, *not* statically allocated.
95 : */
96 : #define YY_EXTRA_TYPE core_yy_extra_type *
97 :
98 : /*
99 : * Each call to yylex must set yylloc to the location of the found token
100 : * (expressed as a byte offset from the start of the input text).
101 : * When we parse a token that requires multiple lexer rules to process,
102 : * this should be done in the first such rule, else yylloc will point
103 : * into the middle of the token.
104 : */
105 : #define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf)
106 :
107 : /*
108 : * Advance yylloc by the given number of bytes.
109 : */
110 : #define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
111 :
112 : /*
113 : * Sometimes, we do want yylloc to point into the middle of a token; this is
114 : * useful for instance to throw an error about an escape sequence within a
115 : * string literal. But if we find no error there, we want to revert yylloc
116 : * to the token start, so that that's the location reported to the parser.
117 : * Use PUSH_YYLLOC/POP_YYLLOC to save/restore yylloc around such code.
118 : * (Currently the implied "stack" is just one location, but someday we might
119 : * need to nest these.)
120 : */
121 : #define PUSH_YYLLOC() (yyextra->save_yylloc = *(yylloc))
122 : #define POP_YYLLOC() (*(yylloc) = yyextra->save_yylloc)
123 :
124 : #define startlit() ( yyextra->literallen = 0 )
125 : static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
126 : static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
127 : static char *litbufdup(core_yyscan_t yyscanner);
128 : static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
129 : static int process_integer_literal(const char *token, YYSTYPE *lval, int base);
130 : static void addunicode(pg_wchar c, yyscan_t yyscanner);
131 :
132 : #define yyerror(msg) scanner_yyerror(msg, yyscanner)
133 :
134 : #define lexer_errposition() scanner_errposition(*(yylloc), yyscanner)
135 :
136 : static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
137 : static void check_escape_warning(core_yyscan_t yyscanner);
138 :
139 : /*
140 : * Work around a bug in flex 2.5.35: it emits a couple of functions that
141 : * it forgets to emit declarations for. Since we use -Wmissing-prototypes,
142 : * this would cause warnings. Providing our own declarations should be
143 : * harmless even when the bug gets fixed.
144 : */
145 : extern int core_yyget_column(yyscan_t yyscanner);
146 : extern void core_yyset_column(int column_no, yyscan_t yyscanner);
147 :
148 : %}
149 :
150 : %option reentrant
151 : %option bison-bridge
152 : %option bison-locations
153 : %option 8bit
154 : %option never-interactive
155 : %option nodefault
156 : %option noinput
157 : %option nounput
158 : %option noyywrap
159 : %option noyyalloc
160 : %option noyyrealloc
161 : %option noyyfree
162 : %option warn
163 : %option prefix="core_yy"
164 :
165 : /*
166 : * OK, here is a short description of lex/flex rules behavior.
167 : * The longest pattern which matches an input string is always chosen.
168 : * For equal-length patterns, the first occurring in the rules list is chosen.
169 : * INITIAL is the starting state, to which all non-conditional rules apply.
170 : * Exclusive states change parsing rules while the state is active. When in
171 : * an exclusive state, only those rules defined for that state apply.
172 : *
173 : * We use exclusive states for quoted strings, extended comments,
174 : * and to eliminate parsing troubles for numeric strings.
175 : * Exclusive states:
176 : * <xb> bit string literal
177 : * <xc> extended C-style comments
178 : * <xd> delimited identifiers (double-quoted identifiers)
179 : * <xh> hexadecimal byte string
180 : * <xq> standard quoted strings
181 : * <xqs> quote stop (detect continued strings)
182 : * <xe> extended quoted strings (support backslash escape sequences)
183 : * <xdolq> $foo$ quoted strings
184 : * <xui> quoted identifier with Unicode escapes
185 : * <xus> quoted string with Unicode escapes
186 : * <xeu> Unicode surrogate pair in extended quoted string
187 : *
188 : * Remember to add an <<EOF>> case whenever you add a new exclusive state!
189 : * The default one is probably not the right thing.
190 : */
191 :
192 : %x xb
193 : %x xc
194 : %x xd
195 : %x xh
196 : %x xq
197 : %x xqs
198 : %x xe
199 : %x xdolq
200 : %x xui
201 : %x xus
202 : %x xeu
203 :
204 : /*
205 : * In order to make the world safe for Windows and Mac clients as well as
206 : * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
207 : * sequence will be seen as two successive newlines, but that doesn't cause
208 : * any problems. Comments that start with -- and extend to the next
209 : * newline are treated as equivalent to a single whitespace character.
210 : *
211 : * NOTE a fine point: if there is no newline following --, we will absorb
212 : * everything to the end of the input as a comment. This is correct. Older
213 : * versions of Postgres failed to recognize -- as a comment if the input
214 : * did not end with a newline.
215 : *
216 : * XXX perhaps \f (formfeed) should be treated as a newline as well?
217 : *
218 : * XXX if you change the set of whitespace characters, fix scanner_isspace()
219 : * to agree.
220 : */
221 :
222 : space [ \t\n\r\f]
223 : horiz_space [ \t\f]
224 : newline [\n\r]
225 : non_newline [^\n\r]
226 :
227 : comment ("--"{non_newline}*)
228 :
229 : whitespace ({space}+|{comment})
230 :
231 : /*
232 : * SQL requires at least one newline in the whitespace separating
233 : * string literals that are to be concatenated. Silly, but who are we
234 : * to argue? Note that {whitespace_with_newline} should not have * after
235 : * it, whereas {whitespace} should generally have a * after it...
236 : */
237 :
238 : special_whitespace ({space}+|{comment}{newline})
239 : horiz_whitespace ({horiz_space}|{comment})
240 : whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
241 :
242 : quote '
243 : /* If we see {quote} then {quotecontinue}, the quoted string continues */
244 : quotecontinue {whitespace_with_newline}{quote}
245 :
246 : /*
247 : * {quotecontinuefail} is needed to avoid lexer backup when we fail to match
248 : * {quotecontinue}. It might seem that this could just be {whitespace}*,
249 : * but if there's a dash after {whitespace_with_newline}, it must be consumed
250 : * to see if there's another dash --- which would start a {comment} and thus
251 : * allow continuation of the {quotecontinue} token.
252 : */
253 : quotecontinuefail {whitespace}*"-"?
254 :
255 : /* Bit string
256 : * It is tempting to scan the string for only those characters
257 : * which are allowed. However, this leads to silently swallowed
258 : * characters if illegal characters are included in the string.
259 : * For example, if xbinside is [01] then B'ABCD' is interpreted
260 : * as a zero-length string, and the ABCD' is lost!
261 : * Better to pass the string forward and let the input routines
262 : * validate the contents.
263 : */
264 : xbstart [bB]{quote}
265 : xbinside [^']*
266 :
267 : /* Hexadecimal byte string */
268 : xhstart [xX]{quote}
269 : xhinside [^']*
270 :
271 : /* National character */
272 : xnstart [nN]{quote}
273 :
274 : /* Quoted string that allows backslash escapes */
275 : xestart [eE]{quote}
276 : xeinside [^\\']+
277 : xeescape [\\][^0-7]
278 : xeoctesc [\\][0-7]{1,3}
279 : xehexesc [\\]x[0-9A-Fa-f]{1,2}
280 : xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
281 : xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
282 :
283 : /* Extended quote
284 : * xqdouble implements embedded quote, ''''
285 : */
286 : xqstart {quote}
287 : xqdouble {quote}{quote}
288 : xqinside [^']+
289 :
290 : /* $foo$ style quotes ("dollar quoting")
291 : * The quoted string starts with $foo$ where "foo" is an optional string
292 : * in the form of an identifier, except that it may not contain "$",
293 : * and extends to the first occurrence of an identical string.
294 : * There is *no* processing of the quoted text.
295 : *
296 : * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
297 : * fails to match its trailing "$".
298 : */
299 : dolq_start [A-Za-z\200-\377_]
300 : dolq_cont [A-Za-z\200-\377_0-9]
301 : dolqdelim \$({dolq_start}{dolq_cont}*)?\$
302 : dolqfailed \${dolq_start}{dolq_cont}*
303 : dolqinside [^$]+
304 :
305 : /* Double quote
306 : * Allows embedded spaces and other special characters in identifiers.
307 : */
308 : dquote \"
309 : xdstart {dquote}
310 : xdstop {dquote}
311 : xddouble {dquote}{dquote}
312 : xdinside [^"]+
313 :
314 : /* Quoted identifier with Unicode escapes */
315 : xuistart [uU]&{dquote}
316 :
317 : /* Quoted string with Unicode escapes */
318 : xusstart [uU]&{quote}
319 :
320 : /* error rule to avoid backup */
321 : xufailed [uU]&
322 :
323 :
324 : /* C-style comments
325 : *
326 : * The "extended comment" syntax closely resembles allowable operator syntax.
327 : * The tricky part here is to get lex to recognize a string starting with
328 : * slash-star as a comment, when interpreting it as an operator would produce
329 : * a longer match --- remember lex will prefer a longer match! Also, if we
330 : * have something like plus-slash-star, lex will think this is a 3-character
331 : * operator whereas we want to see it as a + operator and a comment start.
332 : * The solution is two-fold:
333 : * 1. append {op_chars}* to xcstart so that it matches as much text as
334 : * {operator} would. Then the tie-breaker (first matching rule of same
335 : * length) ensures xcstart wins. We put back the extra stuff with yyless()
336 : * in case it contains a star-slash that should terminate the comment.
337 : * 2. In the operator rule, check for slash-star within the operator, and
338 : * if found throw it back with yyless(). This handles the plus-slash-star
339 : * problem.
340 : * Dash-dash comments have similar interactions with the operator rule.
341 : */
342 : xcstart \/\*{op_chars}*
343 : xcstop \*+\/
344 : xcinside [^*/]+
345 :
346 : ident_start [A-Za-z\200-\377_]
347 : ident_cont [A-Za-z\200-\377_0-9\$]
348 :
349 : identifier {ident_start}{ident_cont}*
350 :
351 : /* Assorted special-case operators and operator-like tokens */
352 : typecast "::"
353 : dot_dot \.\.
354 : colon_equals ":="
355 :
356 : /*
357 : * These operator-like tokens (unlike the above ones) also match the {operator}
358 : * rule, which means that they might be overridden by a longer match if they
359 : * are followed by a comment start or a + or - character. Accordingly, if you
360 : * add to this list, you must also add corresponding code to the {operator}
361 : * block to return the correct token in such cases. (This is not needed in
362 : * psqlscan.l since the token value is ignored there.)
363 : */
364 : equals_greater "=>"
365 : less_equals "<="
366 : greater_equals ">="
367 : less_greater "<>"
368 : not_equals "!="
369 :
370 : /*
371 : * "self" is the set of chars that should be returned as single-character
372 : * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
373 : * which can be one or more characters long (but if a single-char token
374 : * appears in the "self" set, it is not to be returned as an Op). Note
375 : * that the sets overlap, but each has some chars that are not in the other.
376 : *
377 : * If you change either set, adjust the character lists appearing in the
378 : * rule for "operator"!
379 : */
380 : self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
381 : op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
382 : operator {op_chars}+
383 :
384 : /*
385 : * Numbers
386 : *
387 : * Unary minus is not part of a number here. Instead we pass it separately to
388 : * the parser, and there it gets coerced via doNegate().
389 : *
390 : * {numericfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
391 : *
392 : * {realfail} is added to prevent the need for scanner
393 : * backup when the {real} rule fails to match completely.
394 : */
395 : decdigit [0-9]
396 : hexdigit [0-9A-Fa-f]
397 : octdigit [0-7]
398 : bindigit [0-1]
399 :
400 : decinteger {decdigit}(_?{decdigit})*
401 : hexinteger 0[xX](_?{hexdigit})+
402 : octinteger 0[oO](_?{octdigit})+
403 : bininteger 0[bB](_?{bindigit})+
404 :
405 : hexfail 0[xX]_?
406 : octfail 0[oO]_?
407 : binfail 0[bB]_?
408 :
409 : numeric (({decinteger}\.{decinteger}?)|(\.{decinteger}))
410 : numericfail {decdigit}+\.\.
411 :
412 : real ({decinteger}|{numeric})[Ee][-+]?{decinteger}
413 : realfail ({decinteger}|{numeric})[Ee][-+]
414 :
415 : decinteger_junk {decinteger}{ident_start}
416 : hexinteger_junk {hexinteger}{ident_start}
417 : octinteger_junk {octinteger}{ident_start}
418 : bininteger_junk {bininteger}{ident_start}
419 : numeric_junk {numeric}{ident_start}
420 : real_junk {real}{ident_start}
421 :
422 : param \${decinteger}
423 : param_junk \${decinteger}{ident_start}
424 :
425 : other .
426 :
427 : /*
428 : * Dollar quoted strings are totally opaque, and no escaping is done on them.
429 : * Other quoted strings must allow some special characters such as single-quote
430 : * and newline.
431 : * Embedded single-quotes are implemented both in the SQL standard
432 : * style of two adjacent single quotes "''" and in the Postgres/Java style
433 : * of escaped-quote "\'".
434 : * Other embedded escaped characters are matched explicitly and the leading
435 : * backslash is dropped from the string.
436 : * Note that xcstart must appear before operator, as explained above!
437 : * Also whitespace (comment) must appear before operator.
438 : */
439 :
440 : %%
441 :
442 : {whitespace} {
443 : /* ignore: whitespace and "--" comments produce no token */
444 : }
9770 scrappy 445 GIC 9417322 :
8429 tgl 446 49588 : {xcstart} {
447 : /* Set location in case of syntax error in comment */
6235 448 49588 : SET_YYLLOC();
5018 449 49588 : yyextra->xcdepth = 0;
8429 450 49588 : BEGIN(xc);
451 : /* Put back any characters past slash-star; see above */
452 49588 : yyless(2);
453 : }
9770 scrappy 454 49588 :
455 : <xc>{
1608 tgl 456 9 : {xcstart} {
5018 457 9 : (yyextra->xcdepth)++;
458 : /* Put back any characters past slash-star; see above */
8304 lockhart 459 9 : yyless(2);
460 : }
461 9 :
1608 tgl 462 CBC 49597 : {xcstop} {
5018 463 49597 : if (yyextra->xcdepth <= 0)
8304 lockhart 464 GIC 49588 : BEGIN(INITIAL);
8304 lockhart 465 ECB : else
5018 tgl 466 CBC 9 : (yyextra->xcdepth)--;
8304 lockhart 467 ECB : }
9770 scrappy 468 GIC 49597 :
1608 tgl 469 CBC 296902 : {xcinside} {
470 : /* ignore */
6989 tgl 471 ECB : }
9351 lockhart 472 GIC 296902 :
1608 tgl 473 CBC 247325 : {op_chars} {
6989 tgl 474 ECB : /* ignore */
475 : }
8304 lockhart 476 CBC 247325 :
1608 tgl 477 GIC 6 : \*+ {
6527 tgl 478 ECB : /* ignore */
479 : }
6527 tgl 480 CBC 6 :
1608 tgl 481 LBC 0 : <<EOF>> {
1608 tgl 482 UIC 0 : yyerror("unterminated /* comment");
1608 tgl 483 ECB : }
484 : } /* <xc> */
8429 485 :
7607 lockhart 486 CBC 402 : {xbstart} {
487 : /* Binary bit type.
488 : * At some point we should simply pass the string
7553 lockhart 489 ECB : * forward to the parser and label it there.
490 : * In the meantime, place a leading "b" on the string
491 : * to mark it for the input routine as a binary string.
492 : */
6235 tgl 493 CBC 402 : SET_YYLLOC();
7607 lockhart 494 402 : BEGIN(xb);
8574 tgl 495 GIC 402 : startlit();
5018 496 402 : addlitchar('b', yyscanner);
9274 lockhart 497 ECB : }
9274 lockhart 498 GBC 402 : <xh>{xhinside} |
7607 499 2559 : <xb>{xbinside} {
5018 tgl 500 GIC 2559 : addlit(yytext, yyleng, yyscanner);
501 : }
6989 502 2559 : <xb><<EOF>> { yyerror("unterminated bit string literal"); }
6989 tgl 503 LBC 0 :
9274 lockhart 504 GIC 2173 : {xhstart} {
505 : /* Hexadecimal bit type.
506 : * At some point we should simply pass the string
507 : * forward to the parser and label it there.
508 : * In the meantime, place a leading "x" on the string
509 : * to mark it for the input routine as a hex string.
7596 lockhart 510 ECB : */
6235 tgl 511 CBC 2173 : SET_YYLLOC();
9274 lockhart 512 2173 : BEGIN(xh);
8574 tgl 513 2173 : startlit();
5018 tgl 514 GIC 2173 : addlitchar('x', yyscanner);
9274 lockhart 515 ECB : }
7553 lockhart 516 CBC 2173 : <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
9274 lockhart 517 LBC 0 :
7596 lockhart 518 GIC 1 : {xnstart} {
7596 lockhart 519 ECB : /* National character.
7553 lockhart 520 EUB : * We will pass this along as a normal character string,
7553 lockhart 521 ECB : * but preceded with an internally-generated "NCHAR".
522 : */
523 : int kwnum;
524 :
6235 tgl 525 GIC 1 : SET_YYLLOC();
2577 526 1 : yyless(1); /* eat only 'n' this time */
527 :
1554 tgl 528 CBC 1 : kwnum = ScanKeywordLookup("nchar",
529 1 : yyextra->keywordlist);
530 1 : if (kwnum >= 0)
5017 tgl 531 ECB : {
1554 tgl 532 GIC 2 : yylval->keyword = GetScanKeyword(kwnum,
1554 tgl 533 CBC 1 : yyextra->keywordlist);
1554 tgl 534 GBC 1 : return yyextra->keyword_tokens[kwnum];
5017 tgl 535 ECB : }
536 : else
537 : {
538 : /* If NCHAR isn't a keyword, just return "n" */
5017 tgl 539 UIC 0 : yylval->str = pstrdup("n");
540 0 : return IDENT;
541 : }
7596 lockhart 542 ECB : }
543 :
9351 lockhart 544 GIC 549067 : {xqstart} {
5018 tgl 545 CBC 549067 : yyextra->warn_on_first_escape = true;
546 549067 : yyextra->saw_non_ascii = false;
6235 547 549067 : SET_YYLLOC();
2999 tgl 548 GIC 549067 : if (yyextra->standard_conforming_strings)
6243 bruce 549 CBC 549012 : BEGIN(xq);
6243 bruce 550 ECB : else
6243 bruce 551 CBC 55 : BEGIN(xe);
6496 bruce 552 GIC 549067 : startlit();
553 : }
554 549067 : {xestart} {
5018 tgl 555 3318 : yyextra->warn_on_first_escape = false;
5018 tgl 556 GBC 3318 : yyextra->saw_non_ascii = false;
6235 557 3318 : SET_YYLLOC();
6243 bruce 558 GIC 3318 : BEGIN(xe);
8574 tgl 559 3318 : startlit();
560 : }
5275 peter_e 561 CBC 3318 : {xusstart} {
5087 tgl 562 152 : SET_YYLLOC();
2999 563 152 : if (!yyextra->standard_conforming_strings)
5087 peter_e 564 18 : ereport(ERROR,
5087 peter_e 565 ECB : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
566 : errmsg("unsafe use of string constant with Unicode escapes"),
567 : errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
tgl 568 : lexer_errposition()));
5275 peter_e 569 CBC 134 : BEGIN(xus);
5275 peter_e 570 GIC 134 : startlit();
5275 peter_e 571 ECB : }
1182 tgl 572 CBC 134 :
573 555088 : <xb,xh,xq,xe,xus>{quote} {
5103 tgl 574 ECB : /*
1182 575 : * When we are scanning a quoted string and see an end
576 : * quote, we must look ahead for a possible continuation.
577 : * If we don't see one, we know the end quote was in fact
578 : * the end of the string. To reduce the lexer table size,
579 : * we use a single "xqs" state to do the lookahead for all
580 : * types of strings.
5688 andrew 581 : */
1182 tgl 582 GIC 555088 : yyextra->state_before_str_stop = YYSTATE;
583 555088 : BEGIN(xqs);
584 : }
585 555088 : <xqs>{quotecontinue} {
1182 tgl 586 CBC 18 : /*
1182 tgl 587 ECB : * Found a quote continuation, so return to the in-quote
588 : * state and continue scanning the literal. Nothing is
589 : * added to the literal's contents.
590 : */
1182 tgl 591 GIC 18 : BEGIN(yyextra->state_before_str_stop);
592 : }
593 18 : <xqs>{quotecontinuefail} |
594 555070 : <xqs>{other} |
595 : <xqs><<EOF>> {
596 : /*
597 : * Failed to see a quote continuation. Throw back
598 : * everything after the end quote, and handle the string
1182 tgl 599 ECB : * according to the state we were in previously.
600 : */
3678 heikki.linnakangas 601 GIC 555070 : yyless(0);
5275 peter_e 602 CBC 555070 : BEGIN(INITIAL);
1182 tgl 603 ECB :
1182 tgl 604 GIC 555070 : switch (yyextra->state_before_str_stop)
605 : {
606 402 : case xb:
607 402 : yylval->str = litbufdup(yyscanner);
1182 tgl 608 CBC 402 : return BCONST;
1182 tgl 609 GIC 2173 : case xh:
1182 tgl 610 CBC 2173 : yylval->str = litbufdup(yyscanner);
611 2173 : return XCONST;
1182 tgl 612 GIC 552361 : case xq:
613 : case xe:
614 : /*
615 : * Check that the data remains valid, if it might
616 : * have been made invalid by unescaping any chars.
617 : */
1182 tgl 618 CBC 552361 : if (yyextra->saw_non_ascii)
619 3 : pg_verifymbstr(yyextra->literalbuf,
1182 tgl 620 GIC 3 : yyextra->literallen,
1182 tgl 621 ECB : false);
1182 tgl 622 GIC 552361 : yylval->str = litbufdup(yyscanner);
1182 tgl 623 CBC 552361 : return SCONST;
624 134 : case xus:
625 134 : yylval->str = litbufdup(yyscanner);
626 134 : return USCONST;
1182 tgl 627 LBC 0 : default:
628 0 : yyerror("unhandled previous state in xqs");
3678 heikki.linnakangas 629 ECB : }
630 : }
631 :
5275 peter_e 632 GIC 3368 : <xq,xe,xus>{xqdouble} {
5018 tgl 633 3368 : addlitchar('\'', yyscanner);
634 : }
5275 peter_e 635 CBC 3368 : <xq,xus>{xqinside} {
5018 tgl 636 538081 : addlit(yytext, yyleng, yyscanner);
9344 bruce 637 ECB : }
6243 bruce 638 GIC 538081 : <xe>{xeinside} {
5018 tgl 639 CBC 5839 : addlit(yytext, yyleng, yyscanner);
6243 bruce 640 ECB : }
4947 peter_e 641 CBC 5839 : <xe>{xeunicode} {
2577 tgl 642 72 : pg_wchar c = strtoul(yytext + 2, NULL, 16);
4947 peter_e 643 ECB :
1129 tgl 644 EUB : /*
645 : * For consistency with other productions, issue any
646 : * escape warning with cursor pointing to start of string.
647 : * We might want to change that, someday.
648 : */
4947 peter_e 649 CBC 72 : check_escape_warning(yyscanner);
4947 peter_e 650 ECB :
651 : /* Remember start of overall string token ... */
1129 tgl 652 CBC 72 : PUSH_YYLLOC();
1129 tgl 653 ECB : /* ... and set the error cursor to point at this esc seq */
1129 tgl 654 GIC 72 : SET_YYLLOC();
1129 tgl 655 ECB :
4947 peter_e 656 CBC 72 : if (is_utf16_surrogate_first(c))
657 : {
658 15 : yyextra->utf16_first_part = c;
659 15 : BEGIN(xeu);
660 : }
4947 peter_e 661 GIC 57 : else if (is_utf16_surrogate_second(c))
4947 peter_e 662 UIC 0 : yyerror("invalid Unicode surrogate pair");
663 : else
4947 peter_e 664 GIC 57 : addunicode(c, yyscanner);
665 :
1129 tgl 666 ECB : /* Restore yylloc to be start of string token */
1129 tgl 667 GIC 69 : POP_YYLLOC();
668 : }
4947 peter_e 669 CBC 69 : <xeu>{xeunicode} {
2577 tgl 670 GIC 6 : pg_wchar c = strtoul(yytext + 2, NULL, 16);
4947 peter_e 671 ECB :
672 : /* Remember start of overall string token ... */
1129 tgl 673 CBC 6 : PUSH_YYLLOC();
674 : /* ... and set the error cursor to point at this esc seq */
675 6 : SET_YYLLOC();
1129 tgl 676 ECB :
4947 peter_e 677 GIC 6 : if (!is_utf16_surrogate_second(c))
4947 peter_e 678 CBC 6 : yyerror("invalid Unicode surrogate pair");
4947 peter_e 679 EUB :
4947 peter_e 680 UIC 0 : c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c);
4947 peter_e 681 ECB :
4947 peter_e 682 UIC 0 : addunicode(c, yyscanner);
683 :
1129 tgl 684 ECB : /* Restore yylloc to be start of string token */
1129 tgl 685 UIC 0 : POP_YYLLOC();
1129 tgl 686 ECB :
4947 peter_e 687 LBC 0 : BEGIN(xe);
688 : }
1129 tgl 689 UIC 0 : <xeu>. |
1129 tgl 690 CBC 9 : <xeu>\n |
691 : <xeu><<EOF>> {
1129 tgl 692 ECB : /* Set the error cursor to point at missing esc seq */
1129 tgl 693 GIC 9 : SET_YYLLOC();
1129 tgl 694 CBC 9 : yyerror("invalid Unicode surrogate pair");
1129 tgl 695 ECB : }
696 : <xe,xeu>{xeunicodefail} {
1129 tgl 697 GBC 6 : /* Set the error cursor to point at malformed esc seq */
1129 tgl 698 GIC 6 : SET_YYLLOC();
2577 tgl 699 GBC 6 : ereport(ERROR,
700 : (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
701 : errmsg("invalid Unicode escape"),
2577 tgl 702 EUB : errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
703 : lexer_errposition()));
4942 704 : }
705 : <xe>{xeescape} {
6167 tgl 706 GBC 3455 : if (yytext[1] == '\'')
6167 tgl 707 ECB : {
2999 tgl 708 GIC 18 : if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
709 36 : (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
6167 tgl 710 CBC 18 : PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
6167 tgl 711 LBC 0 : ereport(ERROR,
712 : (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
713 : errmsg("unsafe use of \\' in a string literal"),
6167 tgl 714 ECB : errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
715 : lexer_errposition()));
716 : }
5018 tgl 717 GIC 3455 : check_string_escape_warning(yytext[1], yyscanner);
718 3455 : addlitchar(unescape_single_char(yytext[1], yyscanner),
719 : yyscanner);
720 : }
6243 bruce 721 3455 : <xe>{xeoctesc} {
2577 tgl 722 48 : unsigned char c = strtoul(yytext + 1, NULL, 8);
6496 bruce 723 ECB :
5018 tgl 724 GIC 48 : check_escape_warning(yyscanner);
5018 tgl 725 CBC 48 : addlitchar(c, yyscanner);
5103 726 48 : if (c == '\0' || IS_HIGHBIT_SET(c))
5018 tgl 727 LBC 0 : yyextra->saw_non_ascii = true;
7659 peter_e 728 EUB : }
6243 bruce 729 GIC 48 : <xe>{xehexesc} {
2577 tgl 730 6 : unsigned char c = strtoul(yytext + 2, NULL, 16);
731 :
5018 732 6 : check_escape_warning(yyscanner);
733 6 : addlitchar(c, yyscanner);
5103 tgl 734 CBC 6 : if (c == '\0' || IS_HIGHBIT_SET(c))
5018 735 5 : yyextra->saw_non_ascii = true;
736 : }
6243 bruce 737 GIC 6 : <xe>. {
6984 tgl 738 LBC 0 : /* This is only needed for \ just before EOF */
5018 739 0 : addlitchar(yytext[0], yyscanner);
740 : }
5275 peter_e 741 0 : <xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
9344 bruce 742 0 :
6984 tgl 743 CBC 4354 : {dolqdelim} {
6235 tgl 744 GBC 4354 : SET_YYLLOC();
5018 tgl 745 GIC 4354 : yyextra->dolqstart = pstrdup(yytext);
6984 tgl 746 CBC 4354 : BEGIN(xdolq);
747 4354 : startlit();
748 : }
6527 749 4354 : {dolqfailed} {
5108 tgl 750 LBC 0 : SET_YYLLOC();
6527 tgl 751 ECB : /* throw back all but the initial "$" */
6527 tgl 752 LBC 0 : yyless(1);
753 : /* and treat it as {other} */
754 0 : return yytext[0];
6527 tgl 755 EUB : }
6984 756 : <xdolq>{dolqdelim} {
5018 tgl 757 GIC 4573 : if (strcmp(yytext, yyextra->dolqstart) == 0)
6984 tgl 758 EUB : {
5018 tgl 759 GBC 4354 : pfree(yyextra->dolqstart);
5018 tgl 760 CBC 4354 : yyextra->dolqstart = NULL;
6984 761 4354 : BEGIN(INITIAL);
5018 762 4354 : yylval->str = litbufdup(yyscanner);
6984 763 4354 : return SCONST;
6984 tgl 764 ECB : }
765 : else
766 : {
6984 tgl 767 EUB : /*
768 : * When we fail to match $...$ to dolqstart, transfer
769 : * the $... part to the output, but put back the final
770 : * $ for rescanning. Consider $delim$...$junk$delim$
771 : */
2577 tgl 772 GIC 219 : addlit(yytext, yyleng - 1, yyscanner);
773 219 : yyless(yyleng - 1);
6984 tgl 774 ECB : }
775 : }
6984 tgl 776 CBC 219 : <xdolq>{dolqinside} {
5018 777 6505 : addlit(yytext, yyleng, yyscanner);
6984 tgl 778 ECB : }
6527 tgl 779 CBC 6505 : <xdolq>{dolqfailed} {
5018 780 543 : addlit(yytext, yyleng, yyscanner);
781 : }
6984 tgl 782 GIC 543 : <xdolq>. {
783 1608 : /* This is only needed for $ inside the quoted text */
5018 784 1608 : addlitchar(yytext[0], yyscanner);
785 : }
6984 786 1608 : <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
6984 tgl 787 UIC 0 :
9292 lockhart 788 GIC 54002 : {xdstart} {
6235 tgl 789 CBC 54002 : SET_YYLLOC();
9292 lockhart 790 54002 : BEGIN(xd);
8574 tgl 791 GIC 54002 : startlit();
792 : }
5275 peter_e 793 CBC 54002 : {xuistart} {
794 13 : SET_YYLLOC();
5275 peter_e 795 GIC 13 : BEGIN(xui);
5275 peter_e 796 CBC 13 : startlit();
5275 peter_e 797 ECB : }
9292 lockhart 798 GIC 13 : <xd>{xdstop} {
2577 tgl 799 CBC 54002 : char *ident;
6987 tgl 800 ECB :
9292 lockhart 801 CBC 54002 : BEGIN(INITIAL);
5018 tgl 802 GIC 54002 : if (yyextra->literallen == 0)
7648 tgl 803 CBC 3 : yyerror("zero-length delimited identifier");
5018 tgl 804 GBC 53999 : ident = litbufdup(yyscanner);
5018 tgl 805 CBC 53999 : if (yyextra->literallen >= NAMEDATALEN)
5018 tgl 806 LBC 0 : truncate_identifier(ident, yyextra->literallen, true);
5018 tgl 807 CBC 53999 : yylval->str = ident;
8986 bruce 808 53999 : return IDENT;
809 : }
1182 tgl 810 ECB : <xui>{dquote} {
5275 peter_e 811 CBC 13 : BEGIN(INITIAL);
5018 tgl 812 13 : if (yyextra->literallen == 0)
5275 peter_e 813 LBC 0 : yyerror("zero-length delimited identifier");
814 : /* can't truncate till after we de-escape the ident */
1182 tgl 815 CBC 13 : yylval->str = litbufdup(yyscanner);
816 13 : return UIDENT;
817 : }
5275 peter_e 818 ECB : <xd,xui>{xddouble} {
5018 tgl 819 CBC 67 : addlitchar('"', yyscanner);
8281 lockhart 820 ECB : }
5275 peter_e 821 CBC 67 : <xd,xui>{xdinside} {
5018 tgl 822 54075 : addlit(yytext, yyleng, yyscanner);
9292 lockhart 823 EUB : }
5275 peter_e 824 CBC 54075 : <xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
5275 peter_e 825 LBC 0 :
5275 peter_e 826 UIC 0 : {xufailed} {
827 : char *ident;
5108 tgl 828 ECB :
5108 tgl 829 LBC 0 : SET_YYLLOC();
5275 peter_e 830 EUB : /* throw back all but the initial u/U */
5275 peter_e 831 UIC 0 : yyless(1);
5108 tgl 832 ECB : /* and treat it as {identifier} */
5108 tgl 833 LBC 0 : ident = downcase_truncate_identifier(yytext, yyleng, true);
5018 tgl 834 UIC 0 : yylval->str = ident;
5108 835 0 : return IDENT;
5275 peter_e 836 ECB : }
837 :
6989 tgl 838 CBC 107486 : {typecast} {
6235 839 107486 : SET_YYLLOC();
6989 tgl 840 GIC 107486 : return TYPECAST;
6989 tgl 841 ECB : }
9344 bruce 842 EUB :
5017 tgl 843 GBC 149 : {dot_dot} {
5017 tgl 844 GIC 149 : SET_YYLLOC();
845 149 : return DOT_DOT;
5017 tgl 846 EUB : }
847 :
5017 tgl 848 GBC 21190 : {colon_equals} {
5017 tgl 849 GIC 21190 : SET_YYLLOC();
5017 tgl 850 GBC 21190 : return COLON_EQUALS;
5017 tgl 851 EUB : }
852 :
2577 tgl 853 GIC 438 : {equals_greater} {
2952 rhaas 854 438 : SET_YYLLOC();
2952 rhaas 855 CBC 438 : return EQUALS_GREATER;
2952 rhaas 856 ECB : }
857 :
2951 tgl 858 GIC 2616 : {less_equals} {
859 2616 : SET_YYLLOC();
2951 tgl 860 CBC 2616 : return LESS_EQUALS;
2951 tgl 861 ECB : }
862 :
2951 tgl 863 GIC 2992 : {greater_equals} {
864 2992 : SET_YYLLOC();
2951 tgl 865 CBC 2992 : return GREATER_EQUALS;
2951 tgl 866 ECB : }
867 :
2951 tgl 868 GIC 10500 : {less_greater} {
869 : /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
2951 tgl 870 CBC 10500 : SET_YYLLOC();
871 10500 : return NOT_EQUALS;
2951 tgl 872 ECB : }
873 :
2951 tgl 874 GIC 12428 : {not_equals} {
2951 tgl 875 ECB : /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
2951 tgl 876 CBC 12428 : SET_YYLLOC();
877 12428 : return NOT_EQUALS;
878 : }
879 :
6989 880 5708200 : {self} {
6235 881 5708200 : SET_YYLLOC();
6989 882 5708200 : return yytext[0];
883 : }
884 :
9344 bruce 885 42172 : {operator} {
886 : /*
8422 tgl 887 ECB : * Check for embedded slash-star or dash-dash; those
888 : * are comment starts, so operator must stop there.
889 : * Note that slash-star or dash-dash at the first
890 : * character will match a prior rule, not this one.
891 : */
2577 tgl 892 GIC 42172 : int nchars = yyleng;
2577 tgl 893 CBC 42172 : char *slashstar = strstr(yytext, "/*");
894 42172 : char *dashdash = strstr(yytext, "--");
895 :
8429 tgl 896 GIC 42172 : if (slashstar && dashdash)
8429 tgl 897 ECB : {
8422 898 : /* if both appear, take the first one */
8429 tgl 899 LBC 0 : if (slashstar > dashdash)
8429 tgl 900 UIC 0 : slashstar = dashdash;
901 : }
8429 tgl 902 CBC 42172 : else if (!slashstar)
8429 tgl 903 GIC 42137 : slashstar = dashdash;
904 42172 : if (slashstar)
7648 905 54 : nchars = slashstar - yytext;
906 :
907 : /*
908 : * For SQL compatibility, '+' and '-' cannot be the
8422 tgl 909 ECB : * last char of a multi-char operator unless the operator
7122 peter_e 910 : * contains chars that are not in SQL operators.
8422 tgl 911 : * The idea is to lex '=-' as two operators, but not
912 : * to forbid operator names like '?-' that could not be
7122 peter_e 913 : * sequences of SQL operators.
914 : */
1690 rhodiumtoad 915 GIC 42172 : if (nchars > 1 &&
1690 rhodiumtoad 916 GBC 31377 : (yytext[nchars - 1] == '+' ||
917 31373 : yytext[nchars - 1] == '-'))
918 : {
2577 tgl 919 ECB : int ic;
8422 920 :
2577 tgl 921 CBC 1243 : for (ic = nchars - 2; ic >= 0; ic--)
8422 tgl 922 ECB : {
1690 rhodiumtoad 923 GIC 738 : char c = yytext[ic];
924 738 : if (c == '~' || c == '!' || c == '@' ||
925 688 : c == '#' || c == '^' || c == '&' ||
926 550 : c == '|' || c == '`' || c == '?' ||
927 : c == '%')
928 : break;
929 : }
930 703 : if (ic < 0)
931 : {
1690 rhodiumtoad 932 ECB : /*
933 : * didn't find a qualifying character, so remove
934 : * all trailing [+-]
935 : */
936 : do {
1690 rhodiumtoad 937 GIC 505 : nchars--;
1690 rhodiumtoad 938 CBC 505 : } while (nchars > 1 &&
1690 rhodiumtoad 939 GIC 23 : (yytext[nchars - 1] == '+' ||
1690 rhodiumtoad 940 CBC 23 : yytext[nchars - 1] == '-'));
1690 rhodiumtoad 941 ECB : }
8422 tgl 942 : }
943 :
6235 tgl 944 GIC 42172 : SET_YYLLOC();
945 :
8422 946 42172 : if (nchars < yyleng)
8429 tgl 947 ECB : {
948 : /* Strip the unwanted chars from the token */
8429 tgl 949 GIC 559 : yyless(nchars);
950 : /*
951 : * If what we have left is only one char, and it's
952 : * one of the characters matching "self", then
953 : * return it as a character token the same way
8429 tgl 954 ECB : * that the "self" rule would have.
955 : */
8429 tgl 956 CBC 559 : if (nchars == 1 &&
7234 957 482 : strchr(",()[].;:+-*/%^<>=", yytext[0]))
8429 tgl 958 GIC 482 : return yytext[0];
959 : /*
960 : * Likewise, if what we have left is two chars, and
1690 rhodiumtoad 961 ECB : * those match the tokens ">=", "<=", "=>", "<>" or
962 : * "!=", then we must return the appropriate token
963 : * rather than the generic Op.
964 : */
1690 rhodiumtoad 965 GIC 77 : if (nchars == 2)
1690 rhodiumtoad 966 ECB : {
1690 rhodiumtoad 967 GIC 77 : if (yytext[0] == '=' && yytext[1] == '>')
968 23 : return EQUALS_GREATER;
969 54 : if (yytext[0] == '>' && yytext[1] == '=')
970 11 : return GREATER_EQUALS;
971 43 : if (yytext[0] == '<' && yytext[1] == '=')
972 11 : return LESS_EQUALS;
1690 rhodiumtoad 973 CBC 32 : if (yytext[0] == '<' && yytext[1] == '>')
974 14 : return NOT_EQUALS;
975 18 : if (yytext[0] == '!' && yytext[1] == '=')
1690 rhodiumtoad 976 GIC 15 : return NOT_EQUALS;
977 : }
978 : }
979 :
980 : /*
981 : * Complain if operator is too long. Unlike the case
6445 tgl 982 ECB : * for identifiers, we make this an error not a notice-
983 : * and-truncate, because the odds are we are looking at
984 : * a syntactic mistake anyway.
985 : */
6445 tgl 986 CBC 41616 : if (nchars >= NAMEDATALEN)
6445 tgl 987 LBC 0 : yyerror("operator too long");
6445 tgl 988 ECB :
2951 tgl 989 CBC 41616 : yylval->str = pstrdup(yytext);
8986 bruce 990 41616 : return Op;
9344 bruce 991 ECB : }
8450 tgl 992 :
{param}			{
					/*
					 * Positional parameter reference: '$' followed by the
					 * parameter number.  Convert the digits with a checked
					 * 32-bit conversion rather than atol(): atol() yields a
					 * long that would be silently truncated when stored in
					 * the int field ival, and its behavior on out-of-range
					 * input is undefined.  A number that does not fit is
					 * reported as an error instead of being mis-numbered.
					 */
					ErrorSaveContext escontext = {T_ErrorSaveContext};
					int32		val;

					SET_YYLLOC();
					/* skip the leading '$' */
					val = pg_strtoint32_safe(yytext + 1, (Node *) &escontext);
					if (escontext.error_occurred)
						yyerror("parameter number too large");
					yylval->ival = val;
					return PARAM;
				}
998 : {param_junk} {
417 peter 999 3 : SET_YYLLOC();
1000 3 : yyerror("trailing junk after parameter");
1001 : }
1002 :
116 peter 1003 GNC 304244 : {decinteger} {
116 peter 1004 GBC 304244 : SET_YYLLOC();
116 peter 1005 GNC 304244 : return process_integer_literal(yytext, yylval, 10);
116 peter 1006 ECB : }
1007 : {hexinteger} {
116 peter 1008 GNC 1545 : SET_YYLLOC();
1009 1545 : return process_integer_literal(yytext, yylval, 16);
1010 : }
1011 : {octinteger} {
1012 30 : SET_YYLLOC();
1013 30 : return process_integer_literal(yytext, yylval, 8);
1014 : }
1015 : {bininteger} {
1016 31 : SET_YYLLOC();
1017 31 : return process_integer_literal(yytext, yylval, 2);
1018 : }
1019 : {hexfail} {
1020 3 : SET_YYLLOC();
1021 3 : yyerror("invalid hexadecimal integer");
1022 : }
1023 : {octfail} {
6235 tgl 1024 3 : SET_YYLLOC();
116 peter 1025 3 : yyerror("invalid octal integer");
1026 : }
1027 : {binfail} {
1028 3 : SET_YYLLOC();
1029 3 : yyerror("invalid binary integer");
1030 : }
1031 : {numeric} {
6235 tgl 1032 GIC 5467 : SET_YYLLOC();
5018 1033 5467 : yylval->str = pstrdup(yytext);
8448 tgl 1034 CBC 5467 : return FCONST;
8733 lockhart 1035 ECB : }
1036 : {numericfail} {
4896 tgl 1037 CBC 47 : /* throw back the .., and treat as integer */
2577 tgl 1038 GIC 47 : yyless(yyleng - 2);
4896 1039 47 : SET_YYLLOC();
116 peter 1040 GNC 47 : return process_integer_literal(yytext, yylval, 10);
4896 tgl 1041 ECB : }
1042 : {real} {
6235 tgl 1043 GIC 194 : SET_YYLLOC();
5018 tgl 1044 CBC 194 : yylval->str = pstrdup(yytext);
8986 bruce 1045 194 : return FCONST;
9344 bruce 1046 ECB : }
1047 : {realfail} {
6235 tgl 1048 GIC 3 : SET_YYLLOC();
417 peter 1049 CBC 3 : yyerror("trailing junk after numeric literal");
6527 tgl 1050 ECB : }
1051 : {decinteger_junk} {
116 peter 1052 GIC 21 : SET_YYLLOC();
116 peter 1053 CBC 21 : yyerror("trailing junk after numeric literal");
116 peter 1054 ECB : }
1055 : {hexinteger_junk} {
116 peter 1056 GNC 6 : SET_YYLLOC();
1057 6 : yyerror("trailing junk after numeric literal");
1058 : }
1059 : {octinteger_junk} {
1060 3 : SET_YYLLOC();
1061 3 : yyerror("trailing junk after numeric literal");
1062 : }
1063 : {bininteger_junk} {
6235 tgl 1064 3 : SET_YYLLOC();
417 peter 1065 3 : yyerror("trailing junk after numeric literal");
1066 : }
1067 : {numeric_junk} {
417 peter 1068 GIC 21 : SET_YYLLOC();
417 peter 1069 CBC 21 : yyerror("trailing junk after numeric literal");
417 peter 1070 ECB : }
1071 : {real_junk} {
417 peter 1072 GIC 3 : SET_YYLLOC();
417 peter 1073 CBC 3 : yyerror("trailing junk after numeric literal");
6527 tgl 1074 ECB : }
1075 :
9101 lockhart 1076 GIC 9774191 :
9770 scrappy 1077 ECB : {identifier} {
1554 tgl 1078 : int kwnum;
1079 : char *ident;
1080 :
6235 tgl 1081 CBC 9774191 : SET_YYLLOC();
6235 tgl 1082 ECB :
1083 : /* Is it a keyword? */
1554 tgl 1084 GIC 9774191 : kwnum = ScanKeywordLookup(yytext,
1554 tgl 1085 CBC 9774191 : yyextra->keywordlist);
1086 9774191 : if (kwnum >= 0)
7647 tgl 1087 ECB : {
1554 tgl 1088 GIC 9485952 : yylval->keyword = GetScanKeyword(kwnum,
1089 4742976 : yyextra->keywordlist);
1554 tgl 1090 CBC 4742976 : return yyextra->keyword_tokens[kwnum];
7647 tgl 1091 ECB : }
8082 1092 :
1093 : /*
1094 : * No. Convert the identifier to lower case, and truncate
1095 : * if necessary.
1096 : */
6987 tgl 1097 CBC 5031215 : ident = downcase_truncate_identifier(yytext, yyleng, true);
5018 1098 5031215 : yylval->str = ident;
8082 tgl 1099 GIC 5031215 : return IDENT;
1100 : }
9344 bruce 1101 ECB :
6989 tgl 1102 LBC 0 : {other} {
6235 tgl 1103 UIC 0 : SET_YYLLOC();
6989 1104 0 : return yytext[0];
6989 tgl 1105 ECB : }
9770 scrappy 1106 :
6235 tgl 1107 GIC 501547 : <<EOF>> {
1108 501547 : SET_YYLLOC();
6235 tgl 1109 CBC 501547 : yyterminate();
6235 tgl 1110 ECB : }
1111 :
9770 scrappy 1112 UIC 0 : %%
9770 scrappy 1113 ECB :
2068 peter_e 1114 : /* LCOV_EXCL_STOP */
1115 :
/*
 * Subroutines of the main yylex() function reach the scanner's private
 * state through their own "yyscanner" parameter, which every one of them
 * is expected to have.  Instead of going through the yyget_xxx accessor
 * functions (which the compiler may or may not inline), we cheat slightly
 * and cast yyscanner to flex's internal struct type directly.
 */
#undef yyextra
#define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)

/* Same trick for the other flex variables the subroutines refer to. */
#undef yylloc
#define yylloc	(((struct yyguts_t *) yyscanner)->yylloc_r)
#undef yyleng
#define yyleng	(((struct yyguts_t *) yyscanner)->yyleng_r)
1130 :
1131 :
1132 : /*
1133 : * scanner_errposition
5333 1134 : * Report a lexer or grammar error cursor position, if possible.
1135 : *
1136 : * This is expected to be used within an ereport() call, or via an error
1129 1137 : * callback such as setup_scanner_errposition_callback(). The return value
6235 1138 : * is a dummy (always 0, in fact).
1139 : *
1140 : * Note that this can only be used for messages emitted during raw parsing
1129 1141 : * (essentially, scan.l, parser.c, and gram.y), since it requires the
1142 : * yyscanner struct to still be available.
6235 1143 : */
1144 : int
4899 tgl 1145 GIC 572 : scanner_errposition(int location, core_yyscan_t yyscanner)
1146 : {
1147 : int pos;
1148 :
5333 1149 572 : if (location < 0)
1110 tgl 1150 LBC 0 : return 0; /* no-op if location is unknown */
5333 tgl 1151 ECB :
6235 1152 : /* Convert byte offset to character number */
5018 tgl 1153 GIC 572 : pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1154 : /* And pass it to the ereport mechanism */
1110 tgl 1155 GBC 572 : return errposition(pos);
6496 tgl 1156 EUB : }
1157 :
1158 : /*
1159 : * Error context callback for inserting scanner error location.
1129 tgl 1160 ECB : *
1161 : * Note that this will be called for *any* error occurring while the
1162 : * callback is installed. We avoid inserting an irrelevant error location
1163 : * if the error is a query cancel --- are there any other important cases?
1164 : */
1129 tgl 1165 EUB : static void
1129 tgl 1166 GIC 18 : scb_error_callback(void *arg)
1167 : {
1168 18 : ScannerCallbackState *scbstate = (ScannerCallbackState *) arg;
1169 :
1170 18 : if (geterrcode() != ERRCODE_QUERY_CANCELED)
1171 18 : (void) scanner_errposition(scbstate->location, scbstate->yyscanner);
1172 18 : }
1173 :
1174 : /*
1175 : * setup_scanner_errposition_callback
1176 : * Arrange for non-scanner errors to report an error position
1177 : *
1178 : * Sometimes the scanner calls functions that aren't part of the scanner
1179 : * subsystem and can't reasonably be passed the yyscanner pointer; yet
1180 : * we would like any errors thrown in those functions to be tagged with an
1181 : * error location. Use this function to set up an error context stack
1182 : * entry that will accomplish that. Usage pattern:
1183 : *
1184 : * declare a local variable "ScannerCallbackState scbstate"
1185 : * ...
1186 : * setup_scanner_errposition_callback(&scbstate, yyscanner, location);
1187 : * call function that might throw error;
1188 : * cancel_scanner_errposition_callback(&scbstate);
1189 : */
1190 : void
1191 256 : setup_scanner_errposition_callback(ScannerCallbackState *scbstate,
1192 : core_yyscan_t yyscanner,
1193 : int location)
1194 : {
1195 : /* Setup error traceback support for ereport() */
1196 256 : scbstate->yyscanner = yyscanner;
1197 256 : scbstate->location = location;
1129 tgl 1198 CBC 256 : scbstate->errcallback.callback = scb_error_callback;
1129 tgl 1199 GIC 256 : scbstate->errcallback.arg = (void *) scbstate;
1200 256 : scbstate->errcallback.previous = error_context_stack;
1201 256 : error_context_stack = &scbstate->errcallback;
1129 tgl 1202 CBC 256 : }
1129 tgl 1203 EUB :
1204 : /*
1205 : * Cancel a previously-set-up errposition callback.
1129 tgl 1206 ECB : */
1207 : void
1129 tgl 1208 CBC 238 : cancel_scanner_errposition_callback(ScannerCallbackState *scbstate)
1209 : {
1210 : /* Pop the error context stack */
1129 tgl 1211 GIC 238 : error_context_stack = scbstate->errcallback.previous;
1212 238 : }
1213 :
1214 : /*
1215 : * scanner_yyerror
1216 : * Report a lexer or grammar error.
1217 : *
1218 : * The message's cursor position is whatever YYLLOC was last set to,
5018 tgl 1219 ECB : * ie, the start of the current token if called within yylex(), or the
1220 : * most recently lexed token if called from the grammar.
6235 1221 : * This is OK for syntax error messages from the Bison parser, because Bison
1222 : * parsers report error as soon as the first unparsable token is reached.
1223 : * Beware of using yyerror for other purposes, as the cursor position might
1224 : * be misleading!
1225 : */
1226 : void
4899 tgl 1227 GIC 424 : scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1228 : {
5018 1229 424 : const char *loc = yyextra->scanbuf + *yylloc;
1230 :
7539 1231 424 : if (*loc == YY_END_OF_BUFFER_CHAR)
1232 : {
7290 1233 9 : ereport(ERROR,
1234 : (errcode(ERRCODE_SYNTAX_ERROR),
1235 : /* translator: %s is typically the translation of "syntax error" */
1236 : errmsg("%s at end of input", _(message)),
1237 : lexer_errposition()));
1238 : }
1239 : else
1240 : {
1241 415 : ereport(ERROR,
1242 : (errcode(ERRCODE_SYNTAX_ERROR),
1243 : /* translator: first %s is typically the translation of "syntax error" */
6620 bruce 1244 ECB : errmsg("%s at or near \"%s\"", _(message), loc),
1245 : lexer_errposition()));
1246 : }
1247 : }
1248 :
9770 scrappy 1249 :
1250 : /*
7659 peter_e 1251 : * Called before any actual parsing is done
1252 : */
4899 tgl 1253 : core_yyscan_t
5017 tgl 1254 CBC 511001 : scanner_init(const char *str,
4899 tgl 1255 ECB : core_yy_extra_type *yyext,
1256 : const ScanKeywordList *keywordlist,
1257 : const uint16 *keyword_tokens)
1258 : {
5018 tgl 1259 GIC 511001 : Size slen = strlen(str);
1260 : yyscan_t scanner;
7287 tgl 1261 ECB :
5018 tgl 1262 GIC 511001 : if (yylex_init(&scanner) != 0)
5018 tgl 1263 UIC 0 : elog(ERROR, "yylex_init() failed: %m");
5018 tgl 1264 ECB :
4899 tgl 1265 CBC 511001 : core_yyset_extra(yyext, scanner);
1266 :
1554 tgl 1267 GIC 511001 : yyext->keywordlist = keywordlist;
1268 511001 : yyext->keyword_tokens = keyword_tokens;
1269 :
2999 1270 511001 : yyext->backslash_quote = backslash_quote;
1271 511001 : yyext->escape_string_warning = escape_string_warning;
1272 511001 : yyext->standard_conforming_strings = standard_conforming_strings;
1273 :
1274 : /*
1275 : * Make a scan buffer with special termination needed by flex.
1276 : */
5018 1277 511001 : yyext->scanbuf = (char *) palloc(slen + 2);
1278 511001 : yyext->scanbuflen = slen;
1279 511001 : memcpy(yyext->scanbuf, str, slen);
5018 tgl 1280 CBC 511001 : yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
5018 tgl 1281 GIC 511001 : yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
8574 tgl 1282 ECB :
1283 : /* initialize literal buffer to a reasonable but expansible size */
5018 tgl 1284 CBC 511001 : yyext->literalalloc = 1024;
5018 tgl 1285 GIC 511001 : yyext->literalbuf = (char *) palloc(yyext->literalalloc);
5018 tgl 1286 CBC 511001 : yyext->literallen = 0;
1287 :
5018 tgl 1288 GIC 511001 : return scanner;
1289 : }
1290 :
1291 :
1292 : /*
1293 : * Called after parsing is done to clean up after scanner_init()
7659 peter_e 1294 ECB : */
1295 : void
4899 tgl 1296 GIC 510379 : scanner_finish(core_yyscan_t yyscanner)
1297 : {
1298 : /*
1299 : * We don't bother to call yylex_destroy(), because all it would do is
1300 : * pfree a small amount of control storage. It's cheaper to leak the
1301 : * storage until the parsing context is destroyed. The amount of space
1302 : * involved is usually negligible compared to the output parse tree
1303 : * anyway.
1304 : *
1305 : * We do bother to pfree the scanbuf and literal buffer, but only if they
1306 : * represent a nontrivial amount of space. The 8K cutoff is arbitrary.
5018 tgl 1307 ECB : */
5018 tgl 1308 GIC 510379 : if (yyextra->scanbuflen >= 8192)
1309 39 : pfree(yyextra->scanbuf);
1310 510379 : if (yyextra->literalalloc >= 8192)
1311 9 : pfree(yyextra->literalbuf);
7659 peter_e 1312 CBC 510379 : }
1313 :
1314 :
8574 tgl 1315 ECB : static void
4899 tgl 1316 GBC 607875 : addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1317 : {
8574 tgl 1318 ECB : /* enlarge buffer if needed */
5018 tgl 1319 GIC 607875 : if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
8574 tgl 1320 ECB : {
647 drowley 1321 CBC 112 : yyextra->literalalloc = pg_nextpower2_32(yyextra->literallen + yleng + 1);
5018 tgl 1322 GIC 112 : yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
5018 tgl 1323 CBC 112 : yyextra->literalalloc);
8574 tgl 1324 ECB : }
5018 1325 : /* append new data */
5018 tgl 1326 GIC 607875 : memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1327 607875 : yyextra->literallen += yleng;
8574 1328 607875 : }
1329 :
8574 tgl 1330 ECB :
7659 peter_e 1331 : static void
4899 tgl 1332 CBC 11127 : addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
9770 scrappy 1333 ECB : {
7659 peter_e 1334 : /* enlarge buffer if needed */
5018 tgl 1335 GIC 11127 : if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1336 : {
5018 tgl 1337 LBC 0 : yyextra->literalalloc *= 2;
1338 0 : yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1339 0 : yyextra->literalalloc);
1340 : }
5018 tgl 1341 ECB : /* append new data */
5018 tgl 1342 GIC 11127 : yyextra->literalbuf[yyextra->literallen] = ychar;
1343 11127 : yyextra->literallen += 1;
9770 scrappy 1344 11127 : }
1345 :
1346 :
1347 : /*
1348 : * Create a palloc'd copy of literalbuf, adding a trailing null.
7659 peter_e 1349 ECB : */
1350 : static char *
4899 tgl 1351 GIC 613436 : litbufdup(core_yyscan_t yyscanner)
1352 : {
5018 1353 613436 : int llen = yyextra->literallen;
1354 : char *new;
1355 :
1356 613436 : new = palloc(llen + 1);
1357 613436 : memcpy(new, yyextra->literalbuf, llen);
1358 613436 : new[llen] = '\0';
7659 peter_e 1359 613436 : return new;
1360 : }
9770 scrappy 1361 ECB :
1608 tgl 1362 : /*
1363 : * Process {decinteger}, {hexinteger}, etc. Note this will also do the right
1364 : * thing with {numeric}, ie digits and a decimal point.
1365 : */
1366 : static int
116 peter 1367 GNC 305897 : process_integer_literal(const char *token, YYSTYPE *lval, int base)
1368 : {
64 dean.a.rasheed 1369 305897 : ErrorSaveContext escontext = {T_ErrorSaveContext};
1370 : int32 val;
1371 :
1372 305897 : val = pg_strtoint32_safe(token, (Node *) &escontext);
1373 305897 : if (escontext.error_occurred)
4896 tgl 1374 ECB : {
1608 1375 : /* integer too large (or contains decimal pt), treat it as a float */
4896 tgl 1376 GIC 675 : lval->str = pstrdup(token);
1377 675 : return FCONST;
4896 tgl 1378 ECB : }
4896 tgl 1379 CBC 305222 : lval->ival = val;
1380 305222 : return ICONST;
1381 : }
1382 :
1383 : static void
4899 1384 57 : addunicode(pg_wchar c, core_yyscan_t yyscanner)
1385 : {
1386 : ScannerCallbackState scbstate;
1129 tgl 1387 ECB : char buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
1388 :
1129 tgl 1389 GBC 57 : if (!is_valid_unicode_codepoint(c))
4899 1390 3 : yyerror("invalid Unicode escape value");
1129 tgl 1391 EUB :
1392 : /*
1393 : * We expect that pg_unicode_to_server() will complain about any
1129 tgl 1394 ECB : * unconvertible code point, so we don't have to set saw_non_ascii.
1395 : */
1129 tgl 1396 CBC 54 : setup_scanner_errposition_callback(&scbstate, yyscanner, *(yylloc));
1129 tgl 1397 GIC 54 : pg_unicode_to_server(c, (unsigned char *) buf);
1398 54 : cancel_scanner_errposition_callback(&scbstate);
1399 54 : addlit(buf, strlen(buf), yyscanner);
4899 1400 54 : }
1401 :
1402 : static unsigned char
4899 tgl 1403 CBC 3455 : unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1404 : {
7659 peter_e 1405 3455 : switch (c)
1406 : {
7659 peter_e 1407 GIC 13 : case 'b':
7659 peter_e 1408 CBC 13 : return '\b';
1409 1 : case 'f':
1410 1 : return '\f';
1411 560 : case 'n':
7659 peter_e 1412 GIC 560 : return '\n';
1413 36 : case 'r':
1414 36 : return '\r';
1415 10 : case 't':
1416 10 : return '\t';
1417 2835 : default:
1418 : /* check for backslash followed by non-7-bit-ASCII */
5103 tgl 1419 CBC 2835 : if (c == '\0' || IS_HIGHBIT_SET(c))
5018 tgl 1420 UIC 0 : yyextra->saw_non_ascii = true;
5103 tgl 1421 ECB :
7659 peter_e 1422 GIC 2835 : return c;
1423 : }
9770 scrappy 1424 ECB : }
6496 bruce 1425 :
1426 : static void
4899 tgl 1427 GIC 3455 : check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
6243 bruce 1428 ECB : {
6243 bruce 1429 CBC 3455 : if (ychar == '\'')
1430 : {
2999 tgl 1431 18 : if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
6243 bruce 1432 LBC 0 : ereport(WARNING,
1433 : (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1434 : errmsg("nonstandard use of \\' in a string literal"),
1435 : errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
6235 tgl 1436 ECB : lexer_errposition()));
5018 tgl 1437 GIC 18 : yyextra->warn_on_first_escape = false; /* warn only once per string */
1438 : }
6243 bruce 1439 3437 : else if (ychar == '\\')
1440 : {
2999 tgl 1441 CBC 2817 : if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
6243 bruce 1442 31 : ereport(WARNING,
1443 : (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1444 : errmsg("nonstandard use of \\\\ in a string literal"),
1445 : errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1446 : lexer_errposition()));
5018 tgl 1447 GIC 2817 : yyextra->warn_on_first_escape = false; /* warn only once per string */
6243 bruce 1448 ECB : }
1449 : else
5018 tgl 1450 CBC 620 : check_escape_warning(yyscanner);
6243 bruce 1451 3455 : }
6243 bruce 1452 ECB :
1453 : static void
4899 tgl 1454 GIC 746 : check_escape_warning(core_yyscan_t yyscanner)
6496 bruce 1455 ECB : {
2999 tgl 1456 GIC 746 : if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
6496 bruce 1457 LBC 0 : ereport(WARNING,
1458 : (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
6496 tgl 1459 ECB : errmsg("nonstandard use of escape in a string literal"),
2577 1460 : errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
6235 1461 : lexer_errposition()));
2577 tgl 1462 CBC 746 : yyextra->warn_on_first_escape = false; /* warn only once per string */
5018 1463 746 : }
5018 tgl 1464 ECB :
1465 : /*
1466 : * Interface functions to make flex use palloc() instead of malloc().
1467 : * It'd be better to make these static, but flex insists otherwise.
1468 : */
1469 :
1470 : void *
4899 tgl 1471 CBC 1533003 : core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
5018 tgl 1472 EUB : {
5018 tgl 1473 GIC 1533003 : return palloc(bytes);
5018 tgl 1474 ECB : }
1475 :
1476 : void *
4899 tgl 1477 UIC 0 : core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1478 : {
5018 tgl 1479 LBC 0 : if (ptr)
5018 tgl 1480 UIC 0 : return repalloc(ptr, bytes);
5018 tgl 1481 ECB : else
5018 tgl 1482 UIC 0 : return palloc(bytes);
5018 tgl 1483 ECB : }
5018 tgl 1484 EUB :
1485 : void
4899 tgl 1486 UIC 0 : core_yyfree(void *ptr, core_yyscan_t yyscanner)
1487 : {
5018 1488 0 : if (ptr)
5018 tgl 1489 LBC 0 : pfree(ptr);
6496 bruce 1490 UIC 0 : }
|