Age Owner TLA Line data Source code
/*-------------------------------------------------------------------------
 *
 * parser.c
 *		Main entry point/driver for PostgreSQL grammar
 *
 * This should match src/backend/parser/parser.c, except that we do not
 * need to bother with re-entrant interfaces.
 *
 * Note: ECPG doesn't report error location like the backend does.
 * This file will need work if we ever want it to.
 *
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/interfaces/ecpg/preproc/parser.c
 *
 *-------------------------------------------------------------------------
 */
21 :
22 : #include "postgres_fe.h"
23 :
24 : #include "preproc_extern.h"
25 : #include "preproc.h"
26 :
27 :
28 : static bool have_lookahead; /* is lookahead info valid? */
29 : static int lookahead_token; /* one-token lookahead */
30 : static YYSTYPE lookahead_yylval; /* yylval for lookahead token */
31 : static YYLTYPE lookahead_yylloc; /* yylloc for lookahead token */
32 : static char *lookahead_yytext; /* start current token */
33 :
34 : static bool check_uescapechar(unsigned char escape);
35 : static bool ecpg_isspace(char ch);
36 :
37 :
38 : /*
39 : * Intermediate filter between parser and base lexer (base_yylex in scan.l).
40 : *
41 : * This filter is needed because in some cases the standard SQL grammar
42 : * requires more than one token lookahead. We reduce these cases to one-token
43 : * lookahead by replacing tokens here, in order to keep the grammar LALR(1).
44 : *
45 : * Using a filter is simpler than trying to recognize multiword tokens
46 : * directly in scan.l, because we'd have to allow for comments between the
47 : * words. Furthermore it's not clear how to do that without re-introducing
48 : * scanner backtrack, which would cost more performance than this filter
49 : * layer does.
50 : *
51 : * We also use this filter to convert UIDENT and USCONST sequences into
52 : * plain IDENT and SCONST tokens. While that could be handled by additional
53 : * productions in the main grammar, it's more efficient to do it like this.
54 : */
55 : int
5644 tgl 56 CBC 34710 : filtered_base_yylex(void)
57 : {
58 : int cur_token;
59 : int next_token;
60 : YYSTYPE cur_yylval;
61 : YYLTYPE cur_yylloc;
62 : char *cur_yytext;
63 :
64 : /* Get next token --- we might already have it */
65 34710 : if (have_lookahead)
66 : {
67 51 : cur_token = lookahead_token;
68 51 : base_yylval = lookahead_yylval;
69 51 : base_yylloc = lookahead_yylloc;
2310 70 51 : base_yytext = lookahead_yytext;
5644 71 51 : have_lookahead = false;
72 : }
73 : else
74 34659 : cur_token = base_yylex();
75 :
76 : /*
77 : * If this token isn't one that requires lookahead, just return it.
78 : */
79 34710 : switch (cur_token)
80 : {
11 alvherre 81 GNC 52 : case FORMAT:
2951 tgl 82 ECB : case NOT:
83 : case NULLS_P:
84 : case WITH:
85 : case WITHOUT:
86 : case UIDENT:
87 : case USCONST:
2966 tgl 88 GIC 52 : break;
89 34658 : default:
2966 tgl 90 CBC 34658 : return cur_token;
2966 tgl 91 ECB : }
92 :
93 : /* Save and restore lexer output variables around the call */
2966 tgl 94 GIC 52 : cur_yylval = base_yylval;
95 52 : cur_yylloc = base_yylloc;
2310 tgl 96 CBC 52 : cur_yytext = base_yytext;
2966 tgl 97 ECB :
98 : /* Get next token, saving outputs into lookahead variables */
2966 tgl 99 GIC 52 : next_token = base_yylex();
100 :
2966 tgl 101 CBC 52 : lookahead_token = next_token;
2966 tgl 102 GIC 52 : lookahead_yylval = base_yylval;
2966 tgl 103 CBC 52 : lookahead_yylloc = base_yylloc;
2310 104 52 : lookahead_yytext = base_yytext;
2966 tgl 105 ECB :
2966 tgl 106 CBC 52 : base_yylval = cur_yylval;
2966 tgl 107 GIC 52 : base_yylloc = cur_yylloc;
2310 tgl 108 CBC 52 : base_yytext = cur_yytext;
2966 tgl 109 ECB :
2966 tgl 110 CBC 52 : have_lookahead = true;
111 :
2966 tgl 112 ECB : /* Replace cur_token if needed, based on lookahead */
2966 tgl 113 GIC 52 : switch (cur_token)
114 : {
11 alvherre 115 GNC 2 : case FORMAT:
116 : /* Replace FORMAT by FORMAT_LA if it's followed by JSON */
117 : switch (next_token)
118 : {
119 2 : case JSON:
120 2 : cur_token = FORMAT_LA;
121 2 : break;
122 : }
123 2 : break;
124 :
2951 tgl 125 CBC 35 : case NOT:
126 : /* Replace NOT by NOT_LA if it's followed by BETWEEN, IN, etc */
2951 tgl 127 ECB : switch (next_token)
128 : {
2951 tgl 129 UIC 0 : case BETWEEN:
130 : case IN_P:
2951 tgl 131 ECB : case LIKE:
132 : case ILIKE:
133 : case SIMILAR:
2951 tgl 134 UIC 0 : cur_token = NOT_LA;
2951 tgl 135 LBC 0 : break;
136 : }
2951 tgl 137 CBC 35 : break;
138 :
2966 tgl 139 GIC 2 : case NULLS_P:
140 : /* Replace NULLS_P by NULLS_LA if it's followed by FIRST or LAST */
5644 tgl 141 EUB : switch (next_token)
142 : {
5644 tgl 143 GIC 2 : case FIRST_P:
144 : case LAST_P:
2966 145 2 : cur_token = NULLS_LA;
5644 tgl 146 GBC 2 : break;
5644 tgl 147 EUB : }
5644 tgl 148 GIC 2 : break;
5644 tgl 149 ECB :
5644 tgl 150 GIC 6 : case WITH:
2966 tgl 151 ECB : /* Replace WITH by WITH_LA if it's followed by TIME or ORDINALITY */
152 : switch (next_token)
153 : {
5276 peter_e 154 GIC 1 : case TIME:
3541 stark 155 ECB : case ORDINALITY:
2966 tgl 156 GIC 1 : cur_token = WITH_LA;
5644 tgl 157 CBC 1 : break;
5644 tgl 158 ECB : }
5644 tgl 159 GIC 6 : break;
160 :
11 alvherre 161 GNC 4 : case WITHOUT:
162 : /* Replace WITHOUT by WITHOUT_LA if it's followed by TIME */
163 : switch (next_token)
164 : {
5 165 1 : case TIME:
11 166 1 : cur_token = WITHOUT_LA;
167 1 : break;
168 : }
169 4 : break;
1182 tgl 170 CBC 3 : case UIDENT:
171 : case USCONST:
1182 tgl 172 ECB : /* Look ahead for UESCAPE */
1182 tgl 173 GIC 3 : if (next_token == UESCAPE)
174 : {
175 : /* Yup, so get third token, which had better be SCONST */
1182 tgl 176 ECB : const char *escstr;
177 :
178 : /*
179 : * Again save and restore lexer output variables around the
180 : * call
181 : */
1182 tgl 182 GIC 1 : cur_yylval = base_yylval;
1182 tgl 183 CBC 1 : cur_yylloc = base_yylloc;
1182 tgl 184 GIC 1 : cur_yytext = base_yytext;
185 :
186 : /* Get third token */
1182 tgl 187 CBC 1 : next_token = base_yylex();
1182 tgl 188 ECB :
1182 tgl 189 CBC 1 : if (next_token != SCONST)
1182 tgl 190 UIC 0 : mmerror(PARSE_ERROR, ET_ERROR, "UESCAPE must be followed by a simple string literal");
1182 tgl 191 ECB :
192 : /*
193 : * Save and check escape string, which the scanner returns
194 : * with quotes
195 : */
1182 tgl 196 GIC 1 : escstr = base_yylval.str;
197 1 : if (strlen(escstr) != 3 || !check_uescapechar(escstr[1]))
1182 tgl 198 UIC 0 : mmerror(PARSE_ERROR, ET_ERROR, "invalid Unicode escape character");
199 :
1182 tgl 200 GIC 1 : base_yylval = cur_yylval;
201 1 : base_yylloc = cur_yylloc;
202 1 : base_yytext = cur_yytext;
203 :
1182 tgl 204 ECB : /* Combine 3 tokens into 1 */
1182 tgl 205 CBC 1 : base_yylval.str = psprintf("%s UESCAPE %s", base_yylval.str, escstr);
1182 tgl 206 ECB :
207 : /* Clear have_lookahead, thereby consuming all three tokens */
1182 tgl 208 GIC 1 : have_lookahead = false;
1182 tgl 209 ECB : }
210 :
1182 tgl 211 CBC 3 : if (cur_token == UIDENT)
1182 tgl 212 GBC 1 : cur_token = IDENT;
1182 tgl 213 GIC 2 : else if (cur_token == USCONST)
214 2 : cur_token = SCONST;
215 3 : break;
216 : }
217 :
5644 tgl 218 CBC 52 : return cur_token;
5644 tgl 219 ECB : }
1182 tgl 220 EUB :
/*
 * check_uescapechar() and ecpg_isspace() should match their equivalents
 * in pgc.l.
 */
225 :
/*
 * Is 'escape' acceptable as Unicode escape character (UESCAPE syntax)?
 *
 * Returns false for a hexadecimal digit, '+', a single quote, a double
 * quote, or any character ecpg_isspace() treats as whitespace; returns
 * true for everything else.
 */
static bool
check_uescapechar(unsigned char escape)
{
	if (isxdigit(escape)
		|| escape == '+'
		|| escape == '\''
		|| escape == '"'
		|| ecpg_isspace(escape))
		return false;
	else
		return true;
}
239 :
/*
 * ecpg_isspace() --- return true if flex scanner considers char whitespace
 *
 * The characters accepted are space, tab, newline, carriage return and
 * form feed; anything else yields false.
 */
static bool
ecpg_isspace(char ch)
{
	if (ch == ' ' ||
		ch == '\t' ||
		ch == '\n' ||
		ch == '\r' ||
		ch == '\f')
		return true;
	return false;
}
|