Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * wparser_def.c
4 : * Default text search parser
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/wparser_def.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include <limits.h>
18 :
19 : #include "catalog/pg_collation.h"
20 : #include "commands/defrem.h"
21 : #include "tsearch/ts_locale.h"
22 : #include "tsearch/ts_public.h"
23 : #include "tsearch/ts_type.h"
24 : #include "tsearch/ts_utils.h"
25 : #include "utils/builtins.h"
26 :
27 :
28 : /* Define me to enable tracing of parser behavior */
29 : /* #define WPARSER_TRACE */
30 :
31 :
/*
 * Output token categories.
 *
 * The numbering starts at 1 and runs without gaps up to LASTNUM.  The
 * tok_alias[] and lex_descr[] arrays in this file list their entries in the
 * same numeric order (slot 0 is an empty placeholder), so a new category
 * needs a new entry in both arrays and a bumped LASTNUM.
 */
#define ASCIIWORD		1		/* word, all ASCII */
#define WORD_T			2		/* word, all letters */
#define NUMWORD			3		/* word, letters and digits */
#define EMAIL			4		/* email address */
#define URL_T			5		/* URL */
#define HOST			6		/* host */
#define SCIENTIFIC		7		/* scientific notation */
#define VERSIONNUMBER	8		/* version number */
#define NUMPARTHWORD	9		/* hyphenated word part, letters and digits */
#define PARTHWORD		10		/* hyphenated word part, all letters */
#define ASCIIPARTHWORD	11		/* hyphenated word part, all ASCII */
#define SPACE			12		/* space symbols */
#define TAG_T			13		/* XML tag */
#define PROTOCOL		14		/* protocol head */
#define NUMHWORD		15		/* hyphenated word, letters and digits */
#define ASCIIHWORD		16		/* hyphenated word, all ASCII */
#define HWORD			17		/* hyphenated word, all letters */
#define URLPATH			18		/* URL path */
#define FILEPATH		19		/* file or path name */
#define DECIMAL_T		20		/* decimal notation */
#define SIGNEDINT		21		/* signed integer */
#define UNSIGNEDINT		22		/* unsigned integer */
#define XMLENTITY		23		/* XML entity */

/* highest category number in use */
#define LASTNUM			23
59 :
/*
 * Short alias of each token category, indexed by category number.
 * Slot 0 is an unused placeholder because category numbers start at 1.
 */
static const char *const tok_alias[] = {
	"",							/* 0: (unused) */
	"asciiword",				/* 1 */
	"word",						/* 2 */
	"numword",					/* 3 */
	"email",					/* 4 */
	"url",						/* 5 */
	"host",						/* 6 */
	"sfloat",					/* 7 */
	"version",					/* 8 */
	"hword_numpart",			/* 9 */
	"hword_part",				/* 10 */
	"hword_asciipart",			/* 11 */
	"blank",					/* 12 */
	"tag",						/* 13 */
	"protocol",					/* 14 */
	"numhword",					/* 15 */
	"asciihword",				/* 16 */
	"hword",					/* 17 */
	"url_path",					/* 18 */
	"file",						/* 19 */
	"float",					/* 20 */
	"int",						/* 21 */
	"uint",						/* 22 */
	"entity"					/* 23 */
};
86 :
/*
 * Human-readable description of each token category, indexed by category
 * number.  Slot 0 is an unused placeholder because category numbers start
 * at 1.
 */
static const char *const lex_descr[] = {
	"",													/* 0: (unused) */
	"Word, all ASCII",									/* 1 */
	"Word, all letters",								/* 2 */
	"Word, letters and digits",							/* 3 */
	"Email address",									/* 4 */
	"URL",												/* 5 */
	"Host",												/* 6 */
	"Scientific notation",								/* 7 */
	"Version number",									/* 8 */
	"Hyphenated word part, letters and digits",			/* 9 */
	"Hyphenated word part, all letters",				/* 10 */
	"Hyphenated word part, all ASCII",					/* 11 */
	"Space symbols",									/* 12 */
	"XML tag",											/* 13 */
	"Protocol head",									/* 14 */
	"Hyphenated word, letters and digits",				/* 15 */
	"Hyphenated word, all ASCII",						/* 16 */
	"Hyphenated word, all letters",						/* 17 */
	"URL path",											/* 18 */
	"File or path name",								/* 19 */
	"Decimal notation",									/* 20 */
	"Signed integer",									/* 21 */
	"Unsigned integer",									/* 22 */
	"XML entity"										/* 23 */
};
113 :
114 :
/*
 * Parser states.
 *
 * The numeric values are assigned implicitly, so the order of the members
 * matters: do not reorder or insert in the middle without checking every
 * user of these values.  TPS_Null must stay last.
 */
typedef enum
{
	/* start state */
	TPS_Base = 0,

	/* plain words, integers, decimals, versions, scientific notation */
	TPS_InNumWord,
	TPS_InAsciiWord,
	TPS_InWord,
	TPS_InUnsignedInt,
	TPS_InSignedIntFirst,
	TPS_InSignedInt,
	TPS_InSpace,
	TPS_InUDecimalFirst,
	TPS_InUDecimal,
	TPS_InDecimalFirst,
	TPS_InDecimal,
	TPS_InVerVersion,
	TPS_InSVerVersion,
	TPS_InVersionFirst,
	TPS_InVersion,
	TPS_InMantissaFirst,
	TPS_InMantissaSign,
	TPS_InMantissa,

	/* XML entities */
	TPS_InXMLEntityFirst,
	TPS_InXMLEntity,
	TPS_InXMLEntityNumFirst,
	TPS_InXMLEntityNum,
	TPS_InXMLEntityHexNumFirst,
	TPS_InXMLEntityHexNum,
	TPS_InXMLEntityEnd,

	/* tags */
	TPS_InTagFirst,
	TPS_InXMLBegin,
	TPS_InTagCloseFirst,
	TPS_InTagName,
	TPS_InTagBeginEnd,
	TPS_InTag,
	TPS_InTagEscapeK,
	TPS_InTagEscapeKK,
	TPS_InTagBackSleshed,
	TPS_InTagEnd,

	/* comments */
	TPS_InCommentFirst,
	TPS_InCommentLast,
	TPS_InComment,
	TPS_InCloseCommentFirst,
	TPS_InCloseCommentLast,
	TPS_InCommentEnd,

	/* hosts, ports, email addresses */
	TPS_InHostFirstDomain,
	TPS_InHostDomainSecond,
	TPS_InHostDomain,
	TPS_InPortFirst,
	TPS_InPort,
	TPS_InHostFirstAN,
	TPS_InHost,
	TPS_InEmail,

	/* files and paths */
	TPS_InFileFirst,
	TPS_InFileTwiddle,
	TPS_InPathFirst,
	TPS_InPathFirstFirst,
	TPS_InPathSecond,
	TPS_InFile,
	TPS_InFileNext,

	/* URLs */
	TPS_InURLPathFirst,
	TPS_InURLPathStart,
	TPS_InURLPath,
	TPS_InFURL,

	/* protocol heads */
	TPS_InProtocolFirst,
	TPS_InProtocolSecond,
	TPS_InProtocolEnd,

	/* hyphenated words and their parts */
	TPS_InHyphenAsciiWordFirst,
	TPS_InHyphenAsciiWord,
	TPS_InHyphenWordFirst,
	TPS_InHyphenWord,
	TPS_InHyphenNumWordFirst,
	TPS_InHyphenNumWord,
	TPS_InHyphenDigitLookahead,
	TPS_InParseHyphen,
	TPS_InParseHyphenHyphen,
	TPS_InHyphenWordPart,
	TPS_InHyphenAsciiWordPart,
	TPS_InHyphenNumWordPart,
	TPS_InHyphenUnsignedInt,

	TPS_Null					/* last state (fake value) */
} TParserState;
198 :
199 : /* forward declaration */
200 : struct TParser;
201 :
202 : typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
203 : * except p_iseq */
204 : typedef void (*TParserSpecial) (struct TParser *); /* special handler for
205 : * special cases... */
206 :
207 : typedef struct
208 : {
209 : TParserCharTest isclass;
210 : char c;
211 : uint16 flags;
212 : TParserState tostate;
213 : int type;
214 : TParserSpecial special;
215 : } TParserStateActionItem;
216 :
/*
 * Flag bits in TParserStateActionItem.flags.
 *
 * A_NEXT is zero, i.e. it stands for "no flag set"; every other flag is a
 * distinct single bit, so they can be OR'ed together.
 */
#define A_NEXT		0x0000
#define A_BINGO		0x0001
#define A_POP		0x0002
#define A_PUSH		0x0004
#define A_RERUN		0x0008
#define A_CLEAR		0x0010
#define A_MERGE		0x0020
#define A_CLRALL	0x0040
226 :
227 : typedef struct TParserPosition
228 : {
229 : int posbyte; /* position of parser in bytes */
230 : int poschar; /* position of parser in characters */
231 : int charlen; /* length of current char */
232 : int lenbytetoken; /* length of token-so-far in bytes */
233 : int lenchartoken; /* and in chars */
234 : TParserState state;
235 : struct TParserPosition *prev;
236 : const TParserStateActionItem *pushedAtAction;
237 : } TParserPosition;
238 :
239 : typedef struct TParser
240 : {
241 : /* string and position information */
242 : char *str; /* multibyte string */
243 : int lenstr; /* length of mbstring */
244 : wchar_t *wstr; /* wide character string */
245 : pg_wchar *pgwstr; /* wide character string for C-locale */
246 : bool usewide;
247 :
248 : /* State of parse */
249 : int charmaxlen;
250 : TParserPosition *state;
251 : bool ignore;
252 : bool wanthost;
253 :
254 : /* silly char */
255 : char c;
256 :
257 : /* out */
258 : char *token;
259 : int lenbytetoken;
260 : int lenchartoken;
261 : int type;
262 : } TParser;
263 :
264 :
265 : /* forward decls here */
266 : static bool TParserGet(TParser *prs);
267 :
268 :
269 : static TParserPosition *
5624 bruce 270 CBC 5104 : newTParserPosition(TParserPosition *prev)
271 : {
5710 tgl 272 5104 : TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
273 :
274 5104 : if (prev)
275 2619 : memcpy(res, prev, sizeof(TParserPosition));
276 : else
277 2485 : memset(res, 0, sizeof(TParserPosition));
278 :
279 5104 : res->prev = prev;
280 :
281 5104 : res->pushedAtAction = NULL;
282 :
283 5104 : return res;
284 : }
285 :
286 : static TParser *
287 2365 : TParserInit(char *str, int len)
288 : {
289 2365 : TParser *prs = (TParser *) palloc0(sizeof(TParser));
290 :
291 2365 : prs->charmaxlen = pg_database_encoding_max_length();
292 2365 : prs->str = str;
293 2365 : prs->lenstr = len;
294 :
295 : /*
296 : * Use wide char code only when max encoding length > 1.
297 : */
298 2365 : if (prs->charmaxlen > 1)
299 : {
2118 300 2365 : pg_locale_t mylocale = 0; /* TODO */
301 :
5710 302 2365 : prs->usewide = true;
23 jdavis 303 2365 : if (database_ctype_is_c)
304 : {
305 : /*
306 : * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
307 : * be different from sizeof(wchar_t)
308 : */
4382 bruce 309 787 : prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
5151 teodor 310 787 : pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
311 : }
312 : else
313 : {
314 1578 : prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
4369 tgl 315 1578 : char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
316 : mylocale);
317 : }
318 : }
319 : else
5710 tgl 320 UBC 0 : prs->usewide = false;
321 :
5710 tgl 322 CBC 2365 : prs->state = newTParserPosition(NULL);
323 2365 : prs->state->state = TPS_Base;
324 :
325 : #ifdef WPARSER_TRACE
326 : fprintf(stderr, "parsing \"%.*s\"\n", len, str);
327 : #endif
328 :
329 2365 : return prs;
330 : }
331 :
332 : /*
333 : * As an alternative to a full TParserInit one can create a
334 : * TParserCopy which basically is a regular TParser without a private
335 : * copy of the string - instead it uses the one from another TParser.
336 : * This is useful because at some places TParsers are created
337 : * recursively and the repeated copying around of the strings can
338 : * cause major inefficiency if the source string is long.
339 : * The new parser starts parsing at the original's current position.
340 : *
341 : * Obviously one must not close the original TParser before the copy.
342 : */
343 : static TParser *
4863 344 120 : TParserCopyInit(const TParser *orig)
345 : {
346 120 : TParser *prs = (TParser *) palloc0(sizeof(TParser));
347 :
348 120 : prs->charmaxlen = orig->charmaxlen;
349 120 : prs->str = orig->str + orig->state->posbyte;
350 120 : prs->lenstr = orig->lenstr - orig->state->posbyte;
351 120 : prs->usewide = orig->usewide;
352 :
353 120 : if (orig->pgwstr)
354 40 : prs->pgwstr = orig->pgwstr + orig->state->poschar;
355 120 : if (orig->wstr)
356 80 : prs->wstr = orig->wstr + orig->state->poschar;
357 :
358 120 : prs->state = newTParserPosition(NULL);
359 120 : prs->state->state = TPS_Base;
360 :
361 : #ifdef WPARSER_TRACE
362 : fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
363 : #endif
364 :
365 120 : return prs;
366 : }
367 :
368 :
369 : static void
5624 bruce 370 2365 : TParserClose(TParser *prs)
371 : {
5710 tgl 372 4730 : while (prs->state)
373 : {
374 2365 : TParserPosition *ptr = prs->state->prev;
375 :
376 2365 : pfree(prs->state);
377 2365 : prs->state = ptr;
378 : }
379 :
380 2365 : if (prs->wstr)
381 1578 : pfree(prs->wstr);
5151 teodor 382 2365 : if (prs->pgwstr)
383 787 : pfree(prs->pgwstr);
384 :
385 : #ifdef WPARSER_TRACE
386 : fprintf(stderr, "closing parser\n");
387 : #endif
5710 tgl 388 2365 : pfree(prs);
389 2365 : }
390 :
391 : /*
392 : * Close a parser created with TParserCopyInit
393 : */
394 : static void
4863 395 120 : TParserCopyClose(TParser *prs)
396 : {
397 306 : while (prs->state)
398 : {
399 186 : TParserPosition *ptr = prs->state->prev;
400 :
401 186 : pfree(prs->state);
402 186 : prs->state = ptr;
403 : }
404 :
405 : #ifdef WPARSER_TRACE
406 : fprintf(stderr, "closing parser copy\n");
407 : #endif
408 120 : pfree(prs);
409 120 : }
410 :
411 :
412 : /*
413 : * Character-type support functions, equivalent to is* macros, but
414 : * working with any possible encodings and locales. Notes:
415 : * - with multibyte encoding and C-locale isw* function may fail
416 : * or give wrong result.
417 : * - multibyte encoding and C-locale often are used for
418 : * Asian languages.
419 : * - if locale is C then we use pgwstr instead of wstr.
420 : */
421 :
422 : #define p_iswhat(type, nonascii) \
423 : \
424 : static int \
425 : p_is##type(TParser *prs) \
426 : { \
427 : Assert(prs->state); \
428 : if (prs->usewide) \
429 : { \
430 : if (prs->pgwstr) \
431 : { \
432 : unsigned int c = *(prs->pgwstr + prs->state->poschar); \
433 : if (c > 0x7f) \
434 : return nonascii; \
435 : return is##type(c); \
436 : } \
437 : return isw##type(*(prs->wstr + prs->state->poschar)); \
438 : } \
439 : return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
440 : } \
441 : \
442 : static int \
443 : p_isnot##type(TParser *prs) \
444 : { \
445 : return !p_is##type(prs); \
446 : }
447 :
448 : /*
449 : * In C locale with a multibyte encoding, any non-ASCII symbol is considered
450 : * an alpha character, but not a member of other char classes.
451 : */
2025 452 12561 : p_iswhat(alnum, 1)
453 46886 : p_iswhat(alpha, 1)
454 18566 : p_iswhat(digit, 0)
2025 tgl 455 UBC 0 : p_iswhat(lower, 0)
456 0 : p_iswhat(print, 0)
457 0 : p_iswhat(punct, 0)
2025 tgl 458 CBC 339 : p_iswhat(space, 0)
2025 tgl 459 UBC 0 : p_iswhat(upper, 0)
2025 tgl 460 CBC 9 : p_iswhat(xdigit, 0)
461 :
462 : /* p_iseq should be used only for ascii symbols */
463 :
464 : static int
5624 bruce 465 115684 : p_iseq(TParser *prs, char c)
466 : {
5710 tgl 467 115684 : Assert(prs->state);
468 115684 : return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
469 : }
470 :
471 : static int
5624 bruce 472 50025 : p_isEOF(TParser *prs)
473 : {
5710 tgl 474 50025 : Assert(prs->state);
475 50025 : return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
476 : }
477 :
478 : static int
5624 bruce 479 115684 : p_iseqC(TParser *prs)
480 : {
5710 tgl 481 115684 : return p_iseq(prs, prs->c);
482 : }
483 :
484 : static int
5624 bruce 485 UBC 0 : p_isneC(TParser *prs)
486 : {
5710 tgl 487 0 : return !p_iseq(prs, prs->c);
488 : }
489 :
490 : static int
5624 bruce 491 CBC 36730 : p_isascii(TParser *prs)
492 : {
5710 tgl 493 36730 : return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
494 : }
495 :
496 : static int
5624 bruce 497 36730 : p_isasclet(TParser *prs)
498 : {
5647 tgl 499 36730 : return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
500 : }
501 :
502 : static int
4729 503 1329 : p_isurlchar(TParser *prs)
504 : {
505 : char ch;
506 :
507 : /* no non-ASCII need apply */
508 1329 : if (prs->state->charlen != 1)
4729 tgl 509 UBC 0 : return 0;
4729 tgl 510 CBC 1329 : ch = *(prs->str + prs->state->posbyte);
511 : /* no spaces or control characters */
512 1329 : if (ch <= 0x20 || ch >= 0x7F)
513 117 : return 0;
514 : /* reject characters disallowed by RFC 3986 */
515 1212 : switch (ch)
516 : {
517 12 : case '"':
518 : case '<':
519 : case '>':
520 : case '\\':
521 : case '^':
522 : case '`':
523 : case '{':
524 : case '|':
525 : case '}':
526 12 : return 0;
527 : }
528 1200 : return 1;
529 : }
530 :
531 :
/*
 * Deliberately suppress unused-function complaints for the test functions
 * above by referencing each of them once.  The NULL arguments are only
 * placeholders.
 */
void		_make_compiler_happy(void);
void
_make_compiler_happy(void)
{
	(void) p_isalnum(NULL);
	(void) p_isnotalnum(NULL);
	(void) p_isalpha(NULL);
	(void) p_isnotalpha(NULL);
	(void) p_isdigit(NULL);
	(void) p_isnotdigit(NULL);
	(void) p_islower(NULL);
	(void) p_isnotlower(NULL);
	(void) p_isprint(NULL);
	(void) p_isnotprint(NULL);
	(void) p_ispunct(NULL);
	(void) p_isnotpunct(NULL);
	(void) p_isspace(NULL);
	(void) p_isnotspace(NULL);
	(void) p_isupper(NULL);
	(void) p_isnotupper(NULL);
	(void) p_isxdigit(NULL);
	(void) p_isnotxdigit(NULL);
	(void) p_isEOF(NULL);
	(void) p_iseqC(NULL);
	(void) p_isneC(NULL);
}
559 :
560 :
561 : static void
5624 bruce 562 CBC 126 : SpecialTags(TParser *prs)
563 : {
5643 tgl 564 126 : switch (prs->state->lenchartoken)
565 : {
2118 566 3 : case 8: /* </script */
5643 567 3 : if (pg_strncasecmp(prs->token, "</script", 8) == 0)
5710 568 3 : prs->ignore = false;
569 3 : break;
2118 570 12 : case 7: /* <script || </style */
5643 571 12 : if (pg_strncasecmp(prs->token, "</style", 7) == 0)
5710 tgl 572 UBC 0 : prs->ignore = false;
5643 tgl 573 CBC 12 : else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
5710 574 3 : prs->ignore = true;
575 12 : break;
2118 576 9 : case 6: /* <style */
5643 577 9 : if (pg_strncasecmp(prs->token, "<style", 6) == 0)
5710 tgl 578 UBC 0 : prs->ignore = true;
5710 tgl 579 CBC 9 : break;
580 102 : default:
581 102 : break;
582 : }
583 126 : }
584 :
585 : static void
5624 bruce 586 66 : SpecialFURL(TParser *prs)
587 : {
5710 tgl 588 66 : prs->wanthost = true;
5643 589 66 : prs->state->posbyte -= prs->state->lenbytetoken;
590 66 : prs->state->poschar -= prs->state->lenchartoken;
5710 591 66 : }
592 :
593 : static void
5624 bruce 594 18 : SpecialHyphen(TParser *prs)
595 : {
5643 tgl 596 18 : prs->state->posbyte -= prs->state->lenbytetoken;
597 18 : prs->state->poschar -= prs->state->lenchartoken;
5710 598 18 : }
599 :
600 : static void
5624 bruce 601 UBC 0 : SpecialVerVersion(TParser *prs)
602 : {
5643 tgl 603 0 : prs->state->posbyte -= prs->state->lenbytetoken;
604 0 : prs->state->poschar -= prs->state->lenchartoken;
605 0 : prs->state->lenbytetoken = 0;
606 0 : prs->state->lenchartoken = 0;
5710 607 0 : }
608 :
609 : static int
5624 bruce 610 CBC 240 : p_isstophost(TParser *prs)
611 : {
5710 tgl 612 240 : if (prs->wanthost)
613 : {
614 102 : prs->wanthost = false;
615 102 : return 1;
616 : }
617 138 : return 0;
618 : }
619 :
620 : static int
5624 bruce 621 18031 : p_isignore(TParser *prs)
622 : {
5710 tgl 623 18031 : return (prs->ignore) ? 1 : 0;
624 : }
625 :
626 : static int
5624 bruce 627 45 : p_ishost(TParser *prs)
628 : {
4382 629 45 : TParser *tmpprs = TParserCopyInit(prs);
5710 tgl 630 45 : int res = 0;
631 :
5143 teodor 632 45 : tmpprs->wanthost = true;
633 :
5710 tgl 634 45 : if (TParserGet(tmpprs) && tmpprs->type == HOST)
635 : {
5643 636 36 : prs->state->posbyte += tmpprs->lenbytetoken;
637 36 : prs->state->poschar += tmpprs->lenchartoken;
638 36 : prs->state->lenbytetoken += tmpprs->lenbytetoken;
639 36 : prs->state->lenchartoken += tmpprs->lenchartoken;
5710 640 36 : prs->state->charlen = tmpprs->state->charlen;
641 36 : res = 1;
642 : }
4863 643 45 : TParserCopyClose(tmpprs);
644 :
5710 645 45 : return res;
646 : }
647 :
648 : static int
5624 bruce 649 75 : p_isURLPath(TParser *prs)
650 : {
4382 651 75 : TParser *tmpprs = TParserCopyInit(prs);
5710 tgl 652 75 : int res = 0;
653 :
654 75 : tmpprs->state = newTParserPosition(tmpprs->state);
4729 655 75 : tmpprs->state->state = TPS_InURLPathFirst;
656 :
657 75 : if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
658 : {
5643 659 66 : prs->state->posbyte += tmpprs->lenbytetoken;
660 66 : prs->state->poschar += tmpprs->lenchartoken;
661 66 : prs->state->lenbytetoken += tmpprs->lenbytetoken;
662 66 : prs->state->lenchartoken += tmpprs->lenchartoken;
5710 663 66 : prs->state->charlen = tmpprs->state->charlen;
664 66 : res = 1;
665 : }
4863 666 75 : TParserCopyClose(tmpprs);
667 :
5710 668 75 : return res;
669 : }
670 :
671 : /*
672 : * returns true if current character has zero display length or
673 : * it's a special sign in several languages. Such characters
674 : * aren't a word-breaker although they aren't an isalpha.
675 : * In beginning of word they aren't a part of it.
676 : */
677 : static int
5142 teodor 678 4362 : p_isspecial(TParser *prs)
679 : {
680 : /*
681 : * pg_dsplen could return -1 which means error or control character
682 : */
4382 bruce 683 4362 : if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
5142 teodor 684 UBC 0 : return 1;
685 :
686 : /*
687 : * Unicode Characters in the 'Mark, Spacing Combining' Category That
688 : * characters are not alpha although they are not breakers of word too.
689 : * Check that only in utf encoding, because other encodings aren't
690 : * supported by postgres or even exists.
691 : */
4382 bruce 692 CBC 4362 : if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
693 : {
694 : static const pg_wchar strange_letter[] = {
695 : /*
696 : * use binary search, so elements should be ordered
697 : */
698 : 0x0903, /* DEVANAGARI SIGN VISARGA */
699 : 0x093E, /* DEVANAGARI VOWEL SIGN AA */
700 : 0x093F, /* DEVANAGARI VOWEL SIGN I */
701 : 0x0940, /* DEVANAGARI VOWEL SIGN II */
702 : 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
703 : 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
704 : 0x094B, /* DEVANAGARI VOWEL SIGN O */
705 : 0x094C, /* DEVANAGARI VOWEL SIGN AU */
706 : 0x0982, /* BENGALI SIGN ANUSVARA */
707 : 0x0983, /* BENGALI SIGN VISARGA */
708 : 0x09BE, /* BENGALI VOWEL SIGN AA */
709 : 0x09BF, /* BENGALI VOWEL SIGN I */
710 : 0x09C0, /* BENGALI VOWEL SIGN II */
711 : 0x09C7, /* BENGALI VOWEL SIGN E */
712 : 0x09C8, /* BENGALI VOWEL SIGN AI */
713 : 0x09CB, /* BENGALI VOWEL SIGN O */
714 : 0x09CC, /* BENGALI VOWEL SIGN AU */
715 : 0x09D7, /* BENGALI AU LENGTH MARK */
716 : 0x0A03, /* GURMUKHI SIGN VISARGA */
717 : 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
718 : 0x0A3F, /* GURMUKHI VOWEL SIGN I */
719 : 0x0A40, /* GURMUKHI VOWEL SIGN II */
720 : 0x0A83, /* GUJARATI SIGN VISARGA */
721 : 0x0ABE, /* GUJARATI VOWEL SIGN AA */
722 : 0x0ABF, /* GUJARATI VOWEL SIGN I */
723 : 0x0AC0, /* GUJARATI VOWEL SIGN II */
724 : 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
725 : 0x0ACB, /* GUJARATI VOWEL SIGN O */
726 : 0x0ACC, /* GUJARATI VOWEL SIGN AU */
727 : 0x0B02, /* ORIYA SIGN ANUSVARA */
728 : 0x0B03, /* ORIYA SIGN VISARGA */
729 : 0x0B3E, /* ORIYA VOWEL SIGN AA */
730 : 0x0B40, /* ORIYA VOWEL SIGN II */
731 : 0x0B47, /* ORIYA VOWEL SIGN E */
732 : 0x0B48, /* ORIYA VOWEL SIGN AI */
733 : 0x0B4B, /* ORIYA VOWEL SIGN O */
734 : 0x0B4C, /* ORIYA VOWEL SIGN AU */
735 : 0x0B57, /* ORIYA AU LENGTH MARK */
736 : 0x0BBE, /* TAMIL VOWEL SIGN AA */
737 : 0x0BBF, /* TAMIL VOWEL SIGN I */
738 : 0x0BC1, /* TAMIL VOWEL SIGN U */
739 : 0x0BC2, /* TAMIL VOWEL SIGN UU */
740 : 0x0BC6, /* TAMIL VOWEL SIGN E */
741 : 0x0BC7, /* TAMIL VOWEL SIGN EE */
742 : 0x0BC8, /* TAMIL VOWEL SIGN AI */
743 : 0x0BCA, /* TAMIL VOWEL SIGN O */
744 : 0x0BCB, /* TAMIL VOWEL SIGN OO */
745 : 0x0BCC, /* TAMIL VOWEL SIGN AU */
746 : 0x0BD7, /* TAMIL AU LENGTH MARK */
747 : 0x0C01, /* TELUGU SIGN CANDRABINDU */
748 : 0x0C02, /* TELUGU SIGN ANUSVARA */
749 : 0x0C03, /* TELUGU SIGN VISARGA */
750 : 0x0C41, /* TELUGU VOWEL SIGN U */
751 : 0x0C42, /* TELUGU VOWEL SIGN UU */
752 : 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
753 : 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
754 : 0x0C82, /* KANNADA SIGN ANUSVARA */
755 : 0x0C83, /* KANNADA SIGN VISARGA */
756 : 0x0CBE, /* KANNADA VOWEL SIGN AA */
757 : 0x0CC0, /* KANNADA VOWEL SIGN II */
758 : 0x0CC1, /* KANNADA VOWEL SIGN U */
759 : 0x0CC2, /* KANNADA VOWEL SIGN UU */
760 : 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
761 : 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
762 : 0x0CC7, /* KANNADA VOWEL SIGN EE */
763 : 0x0CC8, /* KANNADA VOWEL SIGN AI */
764 : 0x0CCA, /* KANNADA VOWEL SIGN O */
765 : 0x0CCB, /* KANNADA VOWEL SIGN OO */
766 : 0x0CD5, /* KANNADA LENGTH MARK */
767 : 0x0CD6, /* KANNADA AI LENGTH MARK */
768 : 0x0D02, /* MALAYALAM SIGN ANUSVARA */
769 : 0x0D03, /* MALAYALAM SIGN VISARGA */
770 : 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
771 : 0x0D3F, /* MALAYALAM VOWEL SIGN I */
772 : 0x0D40, /* MALAYALAM VOWEL SIGN II */
773 : 0x0D46, /* MALAYALAM VOWEL SIGN E */
774 : 0x0D47, /* MALAYALAM VOWEL SIGN EE */
775 : 0x0D48, /* MALAYALAM VOWEL SIGN AI */
776 : 0x0D4A, /* MALAYALAM VOWEL SIGN O */
777 : 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
778 : 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
779 : 0x0D57, /* MALAYALAM AU LENGTH MARK */
780 : 0x0D82, /* SINHALA SIGN ANUSVARAYA */
781 : 0x0D83, /* SINHALA SIGN VISARGAYA */
782 : 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
783 : 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
784 : 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
785 : 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
786 : 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
787 : 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
788 : 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
789 : 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
790 : 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
791 : * AELA-PILLA */
792 : 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
793 : 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
794 : 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
795 : 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
796 : 0x0F3E, /* TIBETAN SIGN YAR TSHES */
797 : 0x0F3F, /* TIBETAN SIGN MAR TSHES */
798 : 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
799 : 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
800 : 0x102C, /* MYANMAR VOWEL SIGN AA */
801 : 0x1031, /* MYANMAR VOWEL SIGN E */
802 : 0x1038, /* MYANMAR SIGN VISARGA */
803 : 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
804 : 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
805 : 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
806 : 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
807 : 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
808 : 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
809 : 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
810 : 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
811 : 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
812 : 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
813 : 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
814 : 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
815 : 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
816 : 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
817 : 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
818 : 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
819 : 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
820 : 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
821 : 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
822 : 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
823 : 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
824 : 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
825 : 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
826 : 0x17B6, /* KHMER VOWEL SIGN AA */
827 : 0x17BE, /* KHMER VOWEL SIGN OE */
828 : 0x17BF, /* KHMER VOWEL SIGN YA */
829 : 0x17C0, /* KHMER VOWEL SIGN IE */
830 : 0x17C1, /* KHMER VOWEL SIGN E */
831 : 0x17C2, /* KHMER VOWEL SIGN AE */
832 : 0x17C3, /* KHMER VOWEL SIGN AI */
833 : 0x17C4, /* KHMER VOWEL SIGN OO */
834 : 0x17C5, /* KHMER VOWEL SIGN AU */
835 : 0x17C7, /* KHMER SIGN REAHMUK */
836 : 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
837 : 0x1923, /* LIMBU VOWEL SIGN EE */
838 : 0x1924, /* LIMBU VOWEL SIGN AI */
839 : 0x1925, /* LIMBU VOWEL SIGN OO */
840 : 0x1926, /* LIMBU VOWEL SIGN AU */
841 : 0x1929, /* LIMBU SUBJOINED LETTER YA */
842 : 0x192A, /* LIMBU SUBJOINED LETTER RA */
843 : 0x192B, /* LIMBU SUBJOINED LETTER WA */
844 : 0x1930, /* LIMBU SMALL LETTER KA */
845 : 0x1931, /* LIMBU SMALL LETTER NGA */
846 : 0x1933, /* LIMBU SMALL LETTER TA */
847 : 0x1934, /* LIMBU SMALL LETTER NA */
848 : 0x1935, /* LIMBU SMALL LETTER PA */
849 : 0x1936, /* LIMBU SMALL LETTER MA */
850 : 0x1937, /* LIMBU SMALL LETTER RA */
851 : 0x1938, /* LIMBU SMALL LETTER LA */
852 : 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
853 : 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
854 : 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
855 : 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
856 : 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
857 : 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
858 : 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
859 : 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
860 : 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
861 : 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
862 : 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
863 : 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
864 : 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
865 : 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
866 : 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
867 : 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
868 : 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
869 : 0x19C8, /* NEW TAI LUE TONE MARK-1 */
870 : 0x19C9, /* NEW TAI LUE TONE MARK-2 */
871 : 0x1A19, /* BUGINESE VOWEL SIGN E */
872 : 0x1A1A, /* BUGINESE VOWEL SIGN O */
873 : 0x1A1B, /* BUGINESE VOWEL SIGN AE */
874 : 0x1B04, /* BALINESE SIGN BISAH */
875 : 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
876 : 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
877 : 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
878 : 0x1B3E, /* BALINESE VOWEL SIGN TALING */
879 : 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
880 : 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
881 : 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
882 : 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
883 : 0x1B44, /* BALINESE ADEG ADEG */
884 : 0x1B82, /* SUNDANESE SIGN PANGWISAD */
885 : 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
886 : 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
887 : 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
888 : 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
889 : 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
890 : 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
891 : 0x1C26, /* LEPCHA VOWEL SIGN AA */
892 : 0x1C27, /* LEPCHA VOWEL SIGN I */
893 : 0x1C28, /* LEPCHA VOWEL SIGN O */
894 : 0x1C29, /* LEPCHA VOWEL SIGN OO */
895 : 0x1C2A, /* LEPCHA VOWEL SIGN U */
896 : 0x1C2B, /* LEPCHA VOWEL SIGN UU */
897 : 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
898 : 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
899 : 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
900 : 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
901 : 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
902 : 0xA880, /* SAURASHTRA SIGN ANUSVARA */
903 : 0xA881, /* SAURASHTRA SIGN VISARGA */
904 : 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
905 : 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
906 : 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
907 : 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
908 : 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
909 : 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
910 : 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
911 : 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
912 : 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
913 : 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
914 : 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
915 : 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
916 : 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
917 : 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
918 : 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
919 : 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
920 : 0xA952, /* REJANG CONSONANT SIGN H */
921 : 0xA953, /* REJANG VIRAMA */
922 : 0xAA2F, /* CHAM VOWEL SIGN O */
923 : 0xAA30, /* CHAM VOWEL SIGN AI */
924 : 0xAA33, /* CHAM CONSONANT SIGN YA */
925 : 0xAA34, /* CHAM CONSONANT SIGN RA */
926 : 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
927 : };
3368 tgl 928 4362 : const pg_wchar *StopLow = strange_letter,
4382 bruce 929 4362 : *StopHigh = strange_letter + lengthof(strange_letter),
930 : *StopMiddle;
931 : pg_wchar c;
932 :
933 4362 : if (prs->pgwstr)
5142 teodor 934 1454 : c = *(prs->pgwstr + prs->state->poschar);
935 : else
936 2908 : c = (pg_wchar) *(prs->wstr + prs->state->poschar);
937 :
4382 bruce 938 39258 : while (StopLow < StopHigh)
939 : {
5142 teodor 940 34896 : StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
4382 bruce 941 34896 : if (*StopMiddle == c)
5142 teodor 942 UBC 0 : return 1;
4382 bruce 943 CBC 34896 : else if (*StopMiddle < c)
5142 teodor 944 UBC 0 : StopLow = StopMiddle + 1;
945 : else
5142 teodor 946 CBC 34896 : StopHigh = StopMiddle;
947 : }
948 : }
949 :
950 4362 : return 0;
951 : }
952 :
953 : /*
954 : * Table of state/action of parser
955 : */
956 :
957 : static const TParserStateActionItem actionTPS_Base[] = {
958 : {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
959 : {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
960 : {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
961 : {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
962 : {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
963 : {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
964 : {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
965 : {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
966 : {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
967 : {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
968 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
969 : {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
970 : {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
971 : };
972 :
973 :
974 : static const TParserStateActionItem actionTPS_InNumWord[] = {
975 : {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
976 : {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
977 : {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
978 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
979 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
980 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
981 : {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
982 : {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
983 : };
984 :
/*
 * Rules for TPS_InAsciiWord, tried top to bottom (first test returning
 * nonzero wins; the NULL-test row is the default).  The EOF and default rows
 * carry A_BINGO with category ASCIIWORD.
 *
 * Dot, hyphen and digit each have two consecutive rows.  After a POP,
 * TParserGet resumes at the row following the one that pushed, so the
 * second row of each pair is the alternative tried once the first A_PUSH
 * attempt has been popped.  Row order is therefore significant.
 */
985 : static const TParserStateActionItem actionTPS_InAsciiWord[] = {
986 : {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
987 : {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
988 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
989 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
990 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
991 : {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
992 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
993 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
994 : {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
995 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
996 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
997 : {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
998 : {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
999 : {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1000 : {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1001 : };
1002 :
/*
 * Rules for TPS_InWord, tried top to bottom (first test returning nonzero
 * wins; the NULL-test row is the default).  The EOF and default rows carry
 * A_BINGO with category WORD_T.  A digit names TPS_InNumWord and a hyphen
 * uses A_PUSH with TPS_InHyphenWordFirst.
 */
1003 : static const TParserStateActionItem actionTPS_InWord[] = {
1004 : {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1005 : {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1006 : {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1007 : {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1008 : {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1009 : {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1010 : };
1011 :
/*
 * Rules for TPS_InUnsignedInt, tried top to bottom (first test returning
 * nonzero wins; the NULL-test row is the default).  The EOF and default rows
 * carry A_BINGO with category UNSIGNEDINT.
 *
 * Dot has two consecutive A_PUSH rows (host domain first, then unsigned
 * decimal).  After a POP, TParserGet resumes at the row following the one
 * that pushed, so the second row is tried only after the first attempt has
 * been popped.
 */
1012 : static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
1013 : {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1014 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1015 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1016 : {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1017 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1018 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1019 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1020 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1021 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1022 : {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1023 : {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1024 : {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1025 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1026 : {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1027 : };
1028 :
/*
 * Rules for TPS_InSignedIntFirst, which actionTPS_Base pushes on a plus or
 * hyphen.  Only a digit continues (A_NEXT | A_CLEAR with TPS_InSignedInt);
 * EOF and every other character carry A_POP.
 */
1029 : static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
1030 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1031 : {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1032 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1033 : };
1034 :
/*
 * Rules for TPS_InSignedInt, tried top to bottom (first test returning
 * nonzero wins; the NULL-test row is the default).  The EOF and default rows
 * carry A_BINGO with category SIGNEDINT.  A dot uses A_PUSH with
 * TPS_InDecimalFirst; lower or upper case e uses A_PUSH with
 * TPS_InMantissaFirst.
 */
1035 : static const TParserStateActionItem actionTPS_InSignedInt[] = {
1036 : {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1037 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1038 : {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1039 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1040 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1041 : {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1042 : };
1043 :
/*
 * Rules for TPS_InSpace, tried top to bottom (first test returning nonzero
 * wins; the NULL-test row is the default).  Every A_BINGO row here carries
 * category SPACE and state TPS_Base.  The five characters singled out by
 * p_iseqC rows (less-than, hyphen, plus, ampersand, slash) each also have
 * an A_PUSH row in actionTPS_Base.  Other non-alphanumeric characters name
 * TPS_InSpace again.
 */
1044 : static const TParserStateActionItem actionTPS_InSpace[] = {
1045 : {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1046 : {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1047 : {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1048 : {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1049 : {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1050 : {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1051 : {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1052 : {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1053 : {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1054 : };
1055 :
/*
 * Rules for TPS_InUDecimalFirst, which actionTPS_InUnsignedInt pushes on a
 * dot.  A digit carries A_CLEAR with TPS_InUDecimal; EOF and every other
 * character carry A_POP.
 */
1056 : static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
1057 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1058 : {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1059 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1060 : };
1061 :
/*
 * Rules for TPS_InUDecimal, tried top to bottom (first test returning
 * nonzero wins; the NULL-test row is the default).  The EOF and default rows
 * carry A_BINGO with category DECIMAL_T.  A further dot uses A_PUSH with
 * TPS_InVersionFirst; lower or upper case e uses A_PUSH with
 * TPS_InMantissaFirst.
 */
1062 : static const TParserStateActionItem actionTPS_InUDecimal[] = {
1063 : {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1064 : {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1065 : {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1066 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1067 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1068 : {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1069 : };
1070 :
/*
 * Rules for TPS_InDecimalFirst, which actionTPS_InSignedInt pushes on a dot.
 * A digit carries A_CLEAR with TPS_InDecimal; EOF and every other character
 * carry A_POP.
 */
1071 : static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
1072 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1073 : {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1074 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1075 : };
1076 :
/*
 * Rules for TPS_InDecimal, tried top to bottom (first test returning nonzero
 * wins; the NULL-test row is the default).  The EOF and default rows carry
 * A_BINGO with category DECIMAL_T.  Unlike actionTPS_InUDecimal, a further
 * dot here uses A_PUSH with TPS_InVerVersion rather than TPS_InVersionFirst.
 */
1077 : static const TParserStateActionItem actionTPS_InDecimal[] = {
1078 : {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1079 : {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1080 : {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1081 : {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1082 : {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1083 : {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1084 : };
1085 :
/*
 * Rules for TPS_InVerVersion, which actionTPS_InDecimal pushes on a dot.
 * A digit carries A_RERUN with TPS_InSVerVersion and the SpecialVerVersion
 * handler; EOF and every other character carry A_POP.
 * NOTE(review): what SpecialVerVersion does is defined outside this table.
 */
1086 : static const TParserStateActionItem actionTPS_InVerVersion[] = {
1087 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1088 : {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1089 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1090 : };
1091 :
/*
 * Rules for TPS_InSVerVersion, named by the digit row of
 * actionTPS_InVerVersion.  A digit carries A_BINGO | A_CLRALL with category
 * SPACE and state TPS_InUnsignedInt; EOF carries A_POP; any other character
 * carries A_NEXT.
 */
1092 : static const TParserStateActionItem actionTPS_InSVerVersion[] = {
1093 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1094 : {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1095 : {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1096 : };
1097 :
1098 :
/*
 * Rules for TPS_InVersionFirst, pushed on a dot by actionTPS_InUDecimal and
 * actionTPS_InVersion.  A digit carries A_CLEAR with TPS_InVersion; EOF and
 * every other character carry A_POP.
 */
1099 : static const TParserStateActionItem actionTPS_InVersionFirst[] = {
1100 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1101 : {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1102 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1103 : };
1104 :
/*
 * Rules for TPS_InVersion, tried top to bottom (first test returning nonzero
 * wins; the NULL-test row is the default).  The EOF and default rows carry
 * A_BINGO with category VERSIONNUMBER.  Each further dot uses A_PUSH with
 * TPS_InVersionFirst.
 */
1105 : static const TParserStateActionItem actionTPS_InVersion[] = {
1106 : {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1107 : {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1108 : {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1109 : {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1110 : };
1111 :
/*
 * Rules for TPS_InMantissaFirst, pushed by the lower and upper case e rows
 * of the integer and decimal tables.  A digit carries A_CLEAR with
 * TPS_InMantissa; a plus or hyphen names TPS_InMantissaSign; EOF and every
 * other character carry A_POP.
 */
1112 : static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
1113 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1114 : {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1115 : {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1116 : {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1117 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1118 : };
1119 :
/*
 * Rules for TPS_InMantissaSign, named by the sign rows of
 * actionTPS_InMantissaFirst.  A digit carries A_CLEAR with TPS_InMantissa;
 * EOF and every other character carry A_POP.
 */
1120 : static const TParserStateActionItem actionTPS_InMantissaSign[] = {
1121 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1122 : {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1123 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1124 : };
1125 :
/*
 * Rules for TPS_InMantissa.  Digits name TPS_InMantissa again; EOF and any
 * other character carry A_BINGO with category SCIENTIFIC and state TPS_Base.
 */
1126 : static const TParserStateActionItem actionTPS_InMantissa[] = {
1127 : {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1128 : {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1129 : {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1130 : };
1131 :
/*
 * Rules for TPS_InXMLEntityFirst, which actionTPS_Base pushes on an
 * ampersand.  A hash names TPS_InXMLEntityNumFirst; an ASCII letter, colon
 * or underscore names TPS_InXMLEntity; EOF and every other character carry
 * A_POP.
 */
1132 : static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
1133 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1134 : {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1135 : {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1136 : {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1137 : {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1138 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1139 : };
1140 :
/*
 * Rules for TPS_InXMLEntity (body of a named entity).  Alphanumerics, colon,
 * underscore, dot and hyphen name TPS_InXMLEntity again; a semicolon names
 * TPS_InXMLEntityEnd; EOF and every other character carry A_POP.
 */
1141 : static const TParserStateActionItem actionTPS_InXMLEntity[] = {
1142 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1143 : {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1144 : {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1145 : {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1146 : {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1147 : {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1148 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1149 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1150 : };
1151 :
/*
 * Rules for TPS_InXMLEntityNumFirst, named by the hash row of
 * actionTPS_InXMLEntityFirst.  Lower or upper case x names
 * TPS_InXMLEntityHexNumFirst; a digit names TPS_InXMLEntityNum; EOF and
 * every other character carry A_POP.
 */
1152 : static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
1153 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1154 : {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1155 : {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1156 : {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1157 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1158 : };
1159 :
/*
 * Rules for TPS_InXMLEntityHexNumFirst.  A character accepted by p_isxdigit
 * names TPS_InXMLEntityHexNum; EOF and every other character carry A_POP.
 */
1160 : static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
1161 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1162 : {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1163 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1164 : };
1165 :
/*
 * Rules for TPS_InXMLEntityNum.  Digits name TPS_InXMLEntityNum again; a
 * semicolon names TPS_InXMLEntityEnd; EOF and every other character carry
 * A_POP.
 */
1166 : static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
1167 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1168 : {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1169 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1170 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1171 : };
1172 :
/*
 * Rules for TPS_InXMLEntityHexNum.  Characters accepted by p_isxdigit name
 * TPS_InXMLEntityHexNum again; a semicolon names TPS_InXMLEntityEnd; EOF and
 * every other character carry A_POP.
 */
1173 : static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
1174 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1175 : {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1176 : {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1177 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1178 : };
1179 :
/*
 * Rules for TPS_InXMLEntityEnd, named by the semicolon rows of the entity
 * tables.  The single row has a NULL test, so it applies to any input:
 * A_BINGO | A_CLEAR with category XMLENTITY and state TPS_Base.
 */
1180 : static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
1181 : {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1182 : };
1183 :
/*
 * Rules for TPS_InTagFirst, which actionTPS_Base pushes on a less-than sign.
 * Every continuing row uses A_PUSH: slash with TPS_InTagCloseFirst,
 * exclamation mark with TPS_InCommentFirst, question mark with
 * TPS_InXMLBegin, and ASCII letter, colon or underscore with TPS_InTagName.
 * EOF and every other character carry A_POP.
 */
1184 : static const TParserStateActionItem actionTPS_InTagFirst[] = {
1185 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1186 : {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1187 : {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1188 : {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1189 : {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1190 : {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1191 : {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1192 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1193 : };
1194 :
/*
 * Rules for TPS_InXMLBegin, which actionTPS_InTagFirst pushes on a question
 * mark.  Only a lower case x continues (it names TPS_InTag); EOF and every
 * other character carry A_POP.
 */
1195 : static const TParserStateActionItem actionTPS_InXMLBegin[] = {
1196 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1197 : /* <?xml ... */
1198 : /* XXX do we want states for the m and l ? Right now this accepts <?xZ */
1199 : {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1200 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1201 : };
1202 :
/*
 * Rules for TPS_InTagCloseFirst, which actionTPS_InTagFirst pushes on a
 * slash.  An ASCII letter names TPS_InTagName; EOF and every other character
 * carry A_POP.
 */
1203 : static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
1204 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1205 : {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1206 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1207 : };
1208 :
/*
 * Rules for TPS_InTagName, tried top to bottom (first test returning nonzero
 * wins; the NULL-test row is the default, here A_POP).  The greater-than and
 * whitespace rows attach the SpecialTags handler and name TPS_InTagEnd and
 * TPS_InTag respectively; a slash names TPS_InTagBeginEnd.  Alphanumerics,
 * colon, underscore, dot and hyphen carry A_NEXT with TPS_Null.
 * NOTE(review): what SpecialTags does is defined outside this table.
 */
1209 : static const TParserStateActionItem actionTPS_InTagName[] = {
1210 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1211 : /* <br/> case */
1212 : {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1213 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1214 : {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1215 : {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1216 : {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1217 : {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1218 : {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1219 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1220 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1221 : };
1222 :
/*
 * Rules for TPS_InTagBeginEnd, named by the slash row of
 * actionTPS_InTagName.  A greater-than sign names TPS_InTagEnd; EOF and
 * every other character carry A_POP.
 */
1223 : static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
1224 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1225 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1226 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1227 : };
1228 :
/*
 * Rules for TPS_InTag (inside a tag, after its name), tried top to bottom
 * (first test returning nonzero wins; the NULL-test row is the default,
 * here A_POP).  A greater-than sign names TPS_InTagEnd with the SpecialTags
 * handler; a single or double quote names TPS_InTagEscapeK or
 * TPS_InTagEscapeKK.  The remaining rows list the characters that carry
 * A_NEXT with TPS_Null; the whitespace row also attaches SpecialTags.
 */
1229 : static const TParserStateActionItem actionTPS_InTag[] = {
1230 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1231 : {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1232 : {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1233 : {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1234 : {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1235 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1236 : {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1237 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1238 : {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1239 : {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1240 : {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1241 : {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1242 : {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1243 : {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1244 : {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1245 : {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1246 : {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1247 : {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1248 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1249 : };
1250 :
/*
 * Rules for TPS_InTagEscapeK, named by the single-quote row of
 * actionTPS_InTag.  A backslash uses A_PUSH with TPS_InTagBackSleshed; a
 * single quote names TPS_InTag; any other character names
 * TPS_InTagEscapeK again; EOF carries A_POP.
 */
1251 : static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
1252 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1253 : {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1254 : {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1255 : {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1256 : };
1257 :
/*
 * Rules for TPS_InTagEscapeKK, named by the double-quote row of
 * actionTPS_InTag.  A backslash uses A_PUSH with TPS_InTagBackSleshed; a
 * double quote names TPS_InTag; any other character names
 * TPS_InTagEscapeKK again; EOF carries A_POP.
 */
1258 : static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
1259 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1260 : {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1261 : {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1262 : {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1263 : };
1264 :
/*
 * Rules for TPS_InTagBackSleshed, pushed on a backslash by the two quoted
 * tag-value tables.  EOF carries A_POP; any other character carries A_MERGE.
 * NOTE(review): A_MERGE semantics are defined outside this table -- confirm.
 */
1265 : static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
1266 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1267 : {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1268 : };
1269 :
/*
 * Rules for TPS_InTagEnd, named by the greater-than rows of the tag tables.
 * The single row has a NULL test, so it applies to any input:
 * A_BINGO | A_CLRALL with category TAG_T and state TPS_Base.
 */
1270 : static const TParserStateActionItem actionTPS_InTagEnd[] = {
1271 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1272 : };
1273 :
/*
 * Rules for TPS_InCommentFirst, which actionTPS_InTagFirst pushes on an
 * exclamation mark.  A hyphen names TPS_InCommentLast; an upper or lower
 * case D names TPS_InTag (the DOCTYPE case); EOF and every other character
 * carry A_POP.
 */
1274 : static const TParserStateActionItem actionTPS_InCommentFirst[] = {
1275 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1276 : {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1277 : /* <!DOCTYPE ...> */
1278 : {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1279 : {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1280 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1281 : };
1282 :
/*
 * Rules for TPS_InCommentLast (second hyphen of a comment opener).  A hyphen
 * names TPS_InComment; EOF and every other character carry A_POP.
 */
1283 : static const TParserStateActionItem actionTPS_InCommentLast[] = {
1284 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1285 : {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1286 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1287 : };
1288 :
/*
 * Rules for TPS_InComment (comment body).  A hyphen names
 * TPS_InCloseCommentFirst; any other character carries A_NEXT with TPS_Null;
 * EOF carries A_POP.
 */
1289 : static const TParserStateActionItem actionTPS_InComment[] = {
1290 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1291 : {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1292 : {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1293 : };
1294 :
/*
 * Rules for TPS_InCloseCommentFirst (one hyphen seen inside a comment).  A
 * second hyphen names TPS_InCloseCommentLast; any other character names
 * TPS_InComment; EOF carries A_POP.
 */
1295 : static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
1296 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1297 : {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1298 : {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1299 : };
1300 :
/*
 * Rules for TPS_InCloseCommentLast (two hyphens seen inside a comment).  A
 * further hyphen carries A_NEXT with TPS_Null; a greater-than sign names
 * TPS_InCommentEnd; any other character names TPS_InComment; EOF carries
 * A_POP.
 */
1301 : static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
1302 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1303 : {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1304 : {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1305 : {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1306 : };
1307 :
/*
 * Rules for TPS_InCommentEnd.  The single row has a NULL test, so it applies
 * to any input: A_BINGO | A_CLRALL with category TAG_T and state TPS_Base
 * (comments share the TAG_T category with tags).
 */
1308 : static const TParserStateActionItem actionTPS_InCommentEnd[] = {
1309 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1310 : };
1311 :
/*
 * Rules for TPS_InHostFirstDomain, pushed on a dot by the word, integer and
 * host tables.  An ASCII letter names TPS_InHostDomainSecond; a digit names
 * TPS_InHost; EOF and every other character carry A_POP.
 */
1312 : static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
1313 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1314 : {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1315 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1316 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1317 : };
1318 :
/*
 * Rules for TPS_InHostDomainSecond, named by the letter row of
 * actionTPS_InHostFirstDomain.  An ASCII letter names TPS_InHostDomain;
 * digit, hyphen, underscore, dot and at-sign rows use A_PUSH with the host
 * and email states; EOF and every other character carry A_POP.  Unlike
 * actionTPS_InHostDomain, this table has no A_BINGO row.
 */
1319 : static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1320 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1321 : {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1322 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1323 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1324 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1325 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1326 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1327 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1328 : };
1329 :
/*
 * Rules for TPS_InHostDomain, tried top to bottom (first test returning
 * nonzero wins; the NULL-test row is the default).  The EOF and default rows
 * carry A_BINGO | A_CLRALL with category HOST and state TPS_Base; the
 * p_isstophost row carries the same flags and category but names
 * TPS_InURLPathStart.
 *
 * There are two p_isdigit rows.  The first uses A_PUSH with TPS_InHost; the
 * second (A_POP) can only be reached when TParserGet resumes after the
 * first row, i.e. once that push has been popped.
 */
1330 : static const TParserStateActionItem actionTPS_InHostDomain[] = {
1331 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1332 : {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1333 : {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1334 : {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1335 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1336 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1337 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1338 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1339 : {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1340 : {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1341 : {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1342 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1343 : };
1344 :
/*
 * Rules for TPS_InPortFirst, which actionTPS_InHostDomain pushes on a colon.
 * A digit names TPS_InPort; EOF and every other character carry A_POP.
 */
1345 : static const TParserStateActionItem actionTPS_InPortFirst[] = {
1346 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1347 : {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1348 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1349 : };
1350 :
/*
 * Rules for TPS_InPort, tried top to bottom (first test returning nonzero
 * wins; the NULL-test row is the default).  Digits name TPS_InPort again.
 * The EOF, p_isstophost and default rows all carry A_BINGO | A_CLRALL with
 * category HOST; the p_isstophost row names TPS_InURLPathStart, the others
 * TPS_Base.  A slash uses A_PUSH with TPS_InFURL.
 */
1351 : static const TParserStateActionItem actionTPS_InPort[] = {
1352 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1353 : {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1354 : {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1355 : {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1356 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1357 : };
1358 :
/*
 * Rules for TPS_InHostFirstAN, pushed on a hyphen or underscore by the word,
 * integer and host tables.  A digit or ASCII letter names TPS_InHost; EOF
 * and every other character carry A_POP.
 */
1359 : static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1360 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1361 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1362 : {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1363 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1364 : };
1365 :
/*
 * Rules for TPS_InHost, tried top to bottom (first test returning nonzero
 * wins; the NULL-test row is the default, here A_POP).  Digits and ASCII
 * letters name TPS_InHost again; the at-sign, dot, hyphen and underscore
 * rows use A_PUSH with the email and host states.  This table has no
 * A_BINGO row.
 */
1366 : static const TParserStateActionItem actionTPS_InHost[] = {
1367 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1368 : {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1369 : {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1370 : {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1371 : {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1372 : {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1373 : {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1374 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1375 : };
1376 :
/*
 * Rules for TPS_InEmail, pushed on an at-sign by the word, integer and host
 * tables.  There is no EOF row.  If p_isstophost returns nonzero the row
 * carries A_POP; otherwise, if p_ishost returns nonzero, the row carries
 * A_BINGO | A_CLRALL with category EMAIL and state TPS_Base; anything else
 * carries A_POP.
 * NOTE(review): p_isstophost and p_ishost are defined outside this table.
 */
1377 : static const TParserStateActionItem actionTPS_InEmail[] = {
1378 : {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1379 : {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1380 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1381 : };
1382 :
/*
 * Rules for TPS_InFileFirst, the state named or pushed after a slash by the
 * base, word and file tables.  An ASCII letter, digit or underscore names
 * TPS_InFile; a dot names TPS_InPathFirst; a tilde uses A_PUSH with
 * TPS_InFileTwiddle; EOF and every other character carry A_POP.
 */
1383 : static const TParserStateActionItem actionTPS_InFileFirst[] = {
1384 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1385 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1386 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1387 : {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1388 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1389 : {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1390 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1391 : };
1392 :
/*
 * Rules for TPS_InFileTwiddle, pushed on a tilde by actionTPS_Base and
 * actionTPS_InFileFirst.  An ASCII letter, digit or underscore names
 * TPS_InFile; a slash names TPS_InFileFirst; EOF and every other character
 * carry A_POP.
 */
1393 : static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1394 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1395 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1396 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1397 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1398 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1399 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1400 : };
1401 :
/*
 * Rules for TPS_InPathFirst, named by the dot row of actionTPS_InFileFirst.
 * An ASCII letter, digit or underscore names TPS_InFile; a second dot names
 * TPS_InPathSecond; a slash names TPS_InFileFirst; EOF and every other
 * character carry A_POP.
 */
1402 : static const TParserStateActionItem actionTPS_InPathFirst[] = {
1403 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1404 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1405 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1406 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1407 : {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1408 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1409 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1410 : };
1411 :
/*
 * Rules for TPS_InPathFirstFirst, which actionTPS_Base pushes on a leading
 * dot.  A second dot names TPS_InPathSecond; a slash names TPS_InFileFirst;
 * EOF and every other character carry A_POP.
 */
1412 : static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1413 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1414 : {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1415 : {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1416 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1417 : };
1418 :
/*
 * Rules for TPS_InPathSecond (two dots seen).  EOF, the second slash row and
 * whitespace carry A_BINGO | A_CLEAR with category FILEPATH and state
 * TPS_Base; any other character carries A_POP.
 *
 * Slash has two consecutive rows.  The first carries A_NEXT | A_PUSH with
 * TPS_InFileFirst; the second is reached only when TParserGet resumes after
 * the first row, i.e. once that push has been popped.
 */
1419 : static const TParserStateActionItem actionTPS_InPathSecond[] = {
1420 : {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1421 : {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1422 : {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1423 : {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1424 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1425 : };
1426 :
/*
 * Rules for TPS_InFile, tried top to bottom (first test returning nonzero
 * wins; the NULL-test row is the default).  The EOF and default rows carry
 * A_BINGO with category FILEPATH.  ASCII letters, digits, underscore and
 * hyphen name TPS_InFile again; a dot uses A_PUSH with TPS_InFileNext and a
 * slash uses A_PUSH with TPS_InFileFirst.
 */
1427 : static const TParserStateActionItem actionTPS_InFile[] = {
1428 : {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1429 : {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1430 : {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1431 : {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1432 : {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1433 : {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1434 : {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1435 : {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1436 : };
1437 :
/*
 * Rules for TPS_InFileNext, pushed on a dot by the word and file tables.
 * An ASCII letter, digit or underscore carries A_CLEAR with TPS_InFile; EOF
 * and every other character carry A_POP.
 */
1438 : static const TParserStateActionItem actionTPS_InFileNext[] = {
1439 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1440 : {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1441 : {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1442 : {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1443 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1444 : };
1445 :
/*
 * Rules for TPS_InURLPathFirst.  A character accepted by p_isurlchar names
 * TPS_InURLPath; EOF and every other character carry A_POP.
 * NOTE(review): no rule table names this state in its state column, so it is
 * presumably entered from code elsewhere in the file -- confirm.
 */
1446 : static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1447 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1448 : {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1449 : {NULL, 0, A_POP, TPS_Null, 0, NULL},
1450 : };
1451 :
/*
 * Rules for TPS_InURLPathStart, named by the p_isstophost rows of
 * actionTPS_InHostDomain and actionTPS_InPort.  The single row has a NULL
 * test, so it applies to any input: A_NEXT with TPS_InURLPath.
 */
1452 : static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1453 : {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1454 : };
1455 :
/*
 * Rules for TPS_InURLPath.  Characters accepted by p_isurlchar name
 * TPS_InURLPath again; EOF and any other character carry A_BINGO with
 * category URLPATH and state TPS_Base.
 */
1456 : static const TParserStateActionItem actionTPS_InURLPath[] = {
1457 : {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1458 : {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1459 : {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1460 : };
1461 :
/*
 * Rules for TPS_InFURL, pushed on a slash by actionTPS_InHostDomain and
 * actionTPS_InPort.  If p_isURLPath returns nonzero the row carries
 * A_BINGO | A_CLRALL with category URL_T, state TPS_Base and the SpecialFURL
 * handler; EOF and anything else carry A_POP.
 * NOTE(review): p_isURLPath and SpecialFURL are defined outside this table.
 */
1462 : static const TParserStateActionItem actionTPS_InFURL[] = {
1463 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1464 : {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1465 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1466 : };
1467 :
/*
 * Rules for TPS_InProtocolFirst, which actionTPS_InAsciiWord pushes on a
 * colon.  A slash names TPS_InProtocolSecond; EOF and every other character
 * carry A_POP.
 */
1468 : static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1469 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1470 : {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1471 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1472 : };
1473 :
/*
 * Rules for TPS_InProtocolSecond (colon and one slash seen).  A second slash
 * names TPS_InProtocolEnd; EOF and every other character carry A_POP.
 */
1474 : static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1475 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1476 : {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1477 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1478 : };
1479 :
/*
 * Rules for TPS_InProtocolEnd.  The single row has a NULL test, so it
 * applies to any input: A_BINGO | A_CLRALL with category PROTOCOL and state
 * TPS_Base.
 */
1480 : static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1481 : {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1482 : };
1483 :
/*
 * Rules for TPS_InHyphenAsciiWordFirst, pushed on a hyphen by
 * actionTPS_InAsciiWord and actionTPS_InHyphenAsciiWord.  An ASCII letter
 * names TPS_InHyphenAsciiWord, another alphabetic character names
 * TPS_InHyphenWord, a digit names TPS_InHyphenDigitLookahead; EOF and every
 * other character carry A_POP.  p_isasclet is tested before p_isalpha, so
 * ASCII letters never reach the p_isalpha row.
 */
1484 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1485 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1486 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1487 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1488 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1489 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1490 : };
1491 :
/*
 * Rules for TPS_InHyphenAsciiWord, tried top to bottom (first test returning
 * nonzero wins; the NULL-test row is the default).  The EOF and default rows
 * carry A_BINGO | A_CLRALL with category ASCIIHWORD, state TPS_InParseHyphen
 * and the SpecialHyphen handler.  Non-ASCII alphabetic or special characters
 * name TPS_InHyphenWord, a digit names TPS_InHyphenNumWord, and a further
 * hyphen uses A_PUSH with TPS_InHyphenAsciiWordFirst.
 * NOTE(review): what SpecialHyphen does is defined outside this table.
 */
1492 : static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1493 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1494 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1495 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1496 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1497 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1498 : {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1499 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1500 : };
1501 :
/*
 * Rules for TPS_InHyphenWordFirst, pushed on a hyphen by actionTPS_InWord
 * and actionTPS_InHyphenWord.  An alphabetic character names
 * TPS_InHyphenWord, a digit names TPS_InHyphenDigitLookahead; EOF and every
 * other character carry A_POP.
 */
1502 : static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1503 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1504 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1505 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1506 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1507 : };
1508 :
/*
 * Rules for TPS_InHyphenWord, tried top to bottom (first test returning
 * nonzero wins; the NULL-test row is the default).  The EOF and default rows
 * carry A_BINGO | A_CLRALL with category HWORD, state TPS_InParseHyphen and
 * the SpecialHyphen handler.  A digit names TPS_InHyphenNumWord and a
 * further hyphen uses A_PUSH with TPS_InHyphenWordFirst.
 */
1509 : static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1510 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1511 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1512 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1513 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1514 : {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1515 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1516 : };
1517 :
/*
 * Rules for TPS_InHyphenNumWordFirst, pushed on a hyphen by
 * actionTPS_InNumWord and actionTPS_InHyphenNumWord.  An alphabetic
 * character names TPS_InHyphenNumWord, a digit names
 * TPS_InHyphenDigitLookahead; EOF and every other character carry A_POP.
 */
1518 : static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1519 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1520 : {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1521 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1522 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1523 : };
1524 :
/*
 * Rules for TPS_InHyphenNumWord, tried top to bottom (first test returning
 * nonzero wins; the NULL-test row is the default).  The EOF and default rows
 * carry A_BINGO | A_CLRALL with category NUMHWORD, state TPS_InParseHyphen
 * and the SpecialHyphen handler.  Alphanumeric and special characters name
 * TPS_InHyphenNumWord again; a further hyphen uses A_PUSH with
 * TPS_InHyphenNumWordFirst.
 */
1525 : static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1526 : {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1527 : {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1528 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1529 : {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1530 : {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1531 : };
1532 :
/*
 * Rules for TPS_InHyphenDigitLookahead, named by the digit rows of the three
 * hyphen First tables.  Digits name TPS_InHyphenDigitLookahead again; an
 * alphabetic or special character names TPS_InHyphenNumWord; EOF and every
 * other character carry A_POP, so a run consisting only of digits is popped.
 */
1533 : static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1534 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1535 : {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1536 : {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1537 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1538 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1539 : };
1540 :
/*
 * Rules for TPS_InParseHyphen, the state named by the A_BINGO rows of the
 * hyphenated-word tables and by the default rows of the word-part tables.
 * An ASCII letter names TPS_InHyphenAsciiWordPart, another alphabetic
 * character names TPS_InHyphenWordPart, a digit uses A_PUSH with
 * TPS_InHyphenUnsignedInt and a hyphen uses A_PUSH with
 * TPS_InParseHyphenHyphen.  EOF and any other character carry A_RERUN with
 * TPS_Base.
 * NOTE(review): A_RERUN semantics are defined outside this table -- confirm.
 */
1541 : static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1542 : {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1543 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1544 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1545 : {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1546 : {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1547 : {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1548 : };
1549 :
/*
 * Rules for TPS_InParseHyphenHyphen, which actionTPS_InParseHyphen pushes on
 * a hyphen.  If the next character is alphanumeric or special, the row
 * carries A_BINGO | A_CLEAR with category SPACE and state TPS_InParseHyphen;
 * EOF and every other character carry A_POP.
 */
1550 : static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1551 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1552 : {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1553 : {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1554 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1555 : };
1556 :
/*
 * Rules for TPS_InHyphenWordPart, tried top to bottom (first test returning
 * nonzero wins; the NULL-test row is the default).  The EOF and default rows
 * carry A_BINGO with category PARTHWORD; the EOF row names TPS_Base while
 * the default row names TPS_InParseHyphen.  A digit names
 * TPS_InHyphenNumWordPart.
 */
1557 : static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1558 : {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1559 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1560 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1561 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1562 : {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1563 : };
1564 :
/*
 * Rules for TPS_InHyphenAsciiWordPart, tried top to bottom (first test
 * returning nonzero wins; the NULL-test row is the default).  The EOF and
 * default rows carry A_BINGO with category ASCIIPARTHWORD; the EOF row names
 * TPS_Base while the default row names TPS_InParseHyphen.  A non-ASCII
 * alphabetic or special character names TPS_InHyphenWordPart and a digit
 * names TPS_InHyphenNumWordPart.
 */
1565 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1566 : {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1567 : {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1568 : {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1569 : {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1570 : {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1571 : {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1572 : };
1573 :
/*
 * Rules for TPS_InHyphenNumWordPart.  Alphanumeric and special characters
 * name TPS_InHyphenNumWordPart again.  The EOF and default rows carry
 * A_BINGO with category NUMPARTHWORD; the EOF row names TPS_Base while the
 * default row names TPS_InParseHyphen.
 */
1574 : static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1575 : {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1576 : {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1577 : {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1578 : {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1579 : };
1580 :
/*
 * Rules for TPS_InHyphenUnsignedInt, which actionTPS_InParseHyphen pushes on
 * a digit.  Further digits carry A_NEXT with TPS_Null; an alphabetic or
 * special character carries A_CLEAR with TPS_InHyphenNumWordPart; EOF and
 * every other character carry A_POP.  This table has no A_BINGO row.
 */
1581 : static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1582 : {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1583 : {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1584 : {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1585 : {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1586 : {NULL, 0, A_POP, TPS_Null, 0, NULL}
1587 : };
1588 :
1589 :
1590 : /*
1591 : * main table of per-state parser actions
1592 : */
/*
 * One entry of the Actions table: the rule array for a state plus the state
 * value itself.  TParserGet asserts that Actions[s].state == s, which is the
 * only use of the state member.  state_name exists only in WPARSER_TRACE
 * builds, where the trace output prints it.
 */
1593 : typedef struct
1594 : {
1595 : const TParserStateActionItem *action; /* the actual state info */
1596 : TParserState state; /* only for Assert crosscheck */
1597 : #ifdef WPARSER_TRACE
1598 : const char *state_name; /* only for debug printout */
1599 : #endif
1600 : } TParserStateAction;
1601 :
/*
 * Build one TParserStateAction initializer from a state name.  The rule
 * array is referenced as CppConcat(action,state), which matches the naming
 * of the tables (actionTPS_Base for TPS_Base and so on).  WPARSER_TRACE
 * builds add a third initializer, CppAsString(state), for the state_name
 * member.
 */
1602 : #ifdef WPARSER_TRACE
1603 : #define TPARSERSTATEACTION(state) \
1604 : { CppConcat(action,state), state, CppAsString(state) }
1605 : #else
1606 : #define TPARSERSTATEACTION(state) \
1607 : { CppConcat(action,state), state }
1608 : #endif
1609 :
1610 : /*
1611 : * order must be the same as in typedef enum {} TParserState!!
1612 : */
1613 :
/*
 * Per-state rule tables, indexed directly by the TParserState value:
 * TParserGet reads Actions[prs->state->state].action and asserts that the
 * entry's stored state equals the index.  Entries must therefore stay in
 * enum order, and adding a state means adding its entry at the matching
 * position.  TPS_Null has no entry here; TParserGet asserts that the
 * current state is below TPS_Null before indexing.
 */
1614 : static const TParserStateAction Actions[] = {
1615 : TPARSERSTATEACTION(TPS_Base),
1616 : TPARSERSTATEACTION(TPS_InNumWord),
1617 : TPARSERSTATEACTION(TPS_InAsciiWord),
1618 : TPARSERSTATEACTION(TPS_InWord),
1619 : TPARSERSTATEACTION(TPS_InUnsignedInt),
1620 : TPARSERSTATEACTION(TPS_InSignedIntFirst),
1621 : TPARSERSTATEACTION(TPS_InSignedInt),
1622 : TPARSERSTATEACTION(TPS_InSpace),
1623 : TPARSERSTATEACTION(TPS_InUDecimalFirst),
1624 : TPARSERSTATEACTION(TPS_InUDecimal),
1625 : TPARSERSTATEACTION(TPS_InDecimalFirst),
1626 : TPARSERSTATEACTION(TPS_InDecimal),
1627 : TPARSERSTATEACTION(TPS_InVerVersion),
1628 : TPARSERSTATEACTION(TPS_InSVerVersion),
1629 : TPARSERSTATEACTION(TPS_InVersionFirst),
1630 : TPARSERSTATEACTION(TPS_InVersion),
1631 : TPARSERSTATEACTION(TPS_InMantissaFirst),
1632 : TPARSERSTATEACTION(TPS_InMantissaSign),
1633 : TPARSERSTATEACTION(TPS_InMantissa),
1634 : TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1635 : TPARSERSTATEACTION(TPS_InXMLEntity),
1636 : TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1637 : TPARSERSTATEACTION(TPS_InXMLEntityNum),
1638 : TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1639 : TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1640 : TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1641 : TPARSERSTATEACTION(TPS_InTagFirst),
1642 : TPARSERSTATEACTION(TPS_InXMLBegin),
1643 : TPARSERSTATEACTION(TPS_InTagCloseFirst),
1644 : TPARSERSTATEACTION(TPS_InTagName),
1645 : TPARSERSTATEACTION(TPS_InTagBeginEnd),
1646 : TPARSERSTATEACTION(TPS_InTag),
1647 : TPARSERSTATEACTION(TPS_InTagEscapeK),
1648 : TPARSERSTATEACTION(TPS_InTagEscapeKK),
1649 : TPARSERSTATEACTION(TPS_InTagBackSleshed),
1650 : TPARSERSTATEACTION(TPS_InTagEnd),
1651 : TPARSERSTATEACTION(TPS_InCommentFirst),
1652 : TPARSERSTATEACTION(TPS_InCommentLast),
1653 : TPARSERSTATEACTION(TPS_InComment),
1654 : TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1655 : TPARSERSTATEACTION(TPS_InCloseCommentLast),
1656 : TPARSERSTATEACTION(TPS_InCommentEnd),
1657 : TPARSERSTATEACTION(TPS_InHostFirstDomain),
1658 : TPARSERSTATEACTION(TPS_InHostDomainSecond),
1659 : TPARSERSTATEACTION(TPS_InHostDomain),
1660 : TPARSERSTATEACTION(TPS_InPortFirst),
1661 : TPARSERSTATEACTION(TPS_InPort),
1662 : TPARSERSTATEACTION(TPS_InHostFirstAN),
1663 : TPARSERSTATEACTION(TPS_InHost),
1664 : TPARSERSTATEACTION(TPS_InEmail),
1665 : TPARSERSTATEACTION(TPS_InFileFirst),
1666 : TPARSERSTATEACTION(TPS_InFileTwiddle),
1667 : TPARSERSTATEACTION(TPS_InPathFirst),
1668 : TPARSERSTATEACTION(TPS_InPathFirstFirst),
1669 : TPARSERSTATEACTION(TPS_InPathSecond),
1670 : TPARSERSTATEACTION(TPS_InFile),
1671 : TPARSERSTATEACTION(TPS_InFileNext),
1672 : TPARSERSTATEACTION(TPS_InURLPathFirst),
1673 : TPARSERSTATEACTION(TPS_InURLPathStart),
1674 : TPARSERSTATEACTION(TPS_InURLPath),
1675 : TPARSERSTATEACTION(TPS_InFURL),
1676 : TPARSERSTATEACTION(TPS_InProtocolFirst),
1677 : TPARSERSTATEACTION(TPS_InProtocolSecond),
1678 : TPARSERSTATEACTION(TPS_InProtocolEnd),
1679 : TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1680 : TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1681 : TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1682 : TPARSERSTATEACTION(TPS_InHyphenWord),
1683 : TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1684 : TPARSERSTATEACTION(TPS_InHyphenNumWord),
1685 : TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1686 : TPARSERSTATEACTION(TPS_InParseHyphen),
1687 : TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1688 : TPARSERSTATEACTION(TPS_InHyphenWordPart),
1689 : TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1690 : TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1691 : TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1692 : };
1693 :
1694 :
/*
 * TParserGet -- extract the next token from the parser's input string.
 *
 * Starting at prs->state->posbyte, repeatedly takes the rule list of the
 * current state from Actions[], selects the first rule whose isclass test
 * accepts (or else the first rule with a NULL isclass), and applies that
 * rule's special handler, flags and target state.
 *
 * Returns true when a rule flagged A_BINGO fired; prs->token then points at
 * the token start and prs->lenbytetoken, prs->lenchartoken and prs->type
 * describe it.  Returns false if the position is already at or beyond
 * prs->lenstr on entry, or if the loop stops without an A_BINGO rule.
 */
1695 : static bool
5624 bruce 1696 14438 : TParserGet(TParser *prs)
1697 : {
5643 tgl 1698 14438 : const TParserStateActionItem *item = NULL;
1699 :
5647 1700 14438 : Assert(prs->state);
1701 :
5710 1702 14438 : if (prs->state->posbyte >= prs->lenstr)
1703 2365 : return false;
1704 :
5643 1705 12073 : prs->token = prs->str + prs->state->posbyte;
5710 1706 12073 : prs->state->pushedAtAction = NULL;
1707 :
1708 : /* look at string; one extra pass is made at end of input (charlen = 0) */
1709 51585 : while (prs->state->posbyte <= prs->lenstr)
1710 : {
1711 51585 : if (prs->state->posbyte == prs->lenstr)
1712 2440 : prs->state->charlen = 0;
1713 : else
1714 98290 : prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1715 49145 : pg_mblen(prs->str + prs->state->posbyte);
1716 :
1717 51585 : Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1718 51585 : Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1719 51585 : Assert(Actions[prs->state->state].state == prs->state->state);
1720 :
5643 1721 51585 : if (prs->state->pushedAtAction)
1722 : {
1723 : /* After a POP, pick up at the next test */
1724 1296 : item = prs->state->pushedAtAction + 1;
1725 1296 : prs->state->pushedAtAction = NULL;
1726 : }
1727 : else
1728 : {
1729 50289 : item = Actions[prs->state->state].action;
1730 50289 : Assert(item != NULL);
1731 : }
1732 :
1733 : /* find action by character class; stops at a rule with no isclass test */
5710 1734 277734 : while (item->isclass)
1735 : {
1736 262062 : prs->c = item->c;
1737 262062 : if (item->isclass(prs) != 0)
5643 1738 35913 : break;
5710 1739 226149 : item++;
1740 : }
1741 :
1742 : #ifdef WPARSER_TRACE
1743 : {
1744 : TParserPosition *ptr;
1745 :
1746 : fprintf(stderr, "state ");
1747 : /* indent according to stack depth */
1748 : for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1749 : fprintf(stderr, " ");
1750 : fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1751 : if (prs->state->posbyte < prs->lenstr)
1752 : fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1753 : else
1754 : fprintf(stderr, "at EOF");
1755 : fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1756 : (int) (item - Actions[prs->state->state].action),
1757 : (item->flags & A_BINGO) ? " BINGO" : "",
1758 : (item->flags & A_POP) ? " POP" : "",
1759 : (item->flags & A_PUSH) ? " PUSH" : "",
1760 : (item->flags & A_RERUN) ? " RERUN" : "",
1761 : (item->flags & A_CLEAR) ? " CLEAR" : "",
1762 : (item->flags & A_MERGE) ? " MERGE" : "",
1763 : (item->flags & A_CLRALL) ? " CLRALL" : "",
1764 : (item->tostate != TPS_Null) ? " tostate " : "",
1765 : (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1766 : (item->type > 0) ? " type " : "",
1767 : tok_alias[item->type]);
1768 : }
1769 : #endif
1770 :
1771 : /* call special handler if exists */
1772 51585 : if (item->special)
1773 210 : item->special(prs);
1774 :
1775 : /* BINGO, token is found: publish its length and type, reset the counters */
1776 51585 : if (item->flags & A_BINGO)
1777 : {
1778 12073 : Assert(item->type > 0);
5643 1779 12073 : prs->lenbytetoken = prs->state->lenbytetoken;
1780 12073 : prs->lenchartoken = prs->state->lenchartoken;
1781 12073 : prs->state->lenbytetoken = prs->state->lenchartoken = 0;
5710 1782 12073 : prs->type = item->type;
1783 : }
1784 :
1785 : /* do various actions by flags; at most one of these branches is taken */
1786 51585 : if (item->flags & A_POP)
1787 : { /* pop stored state in stack */
1788 1305 : TParserPosition *ptr = prs->state->prev;
1789 :
1790 1305 : pfree(prs->state);
1791 1305 : prs->state = ptr;
1792 1305 : Assert(prs->state);
1793 : }
1794 50280 : else if (item->flags & A_PUSH)
1795 : { /* push (store) state in stack */
1796 2544 : prs->state->pushedAtAction = item; /* remember where we push */
1797 2544 : prs->state = newTParserPosition(prs->state);
1798 : }
1799 47736 : else if (item->flags & A_CLEAR)
1800 : { /* clear previous pushed state */
1801 : TParserPosition *ptr;
1802 :
1803 249 : Assert(prs->state->prev);
1804 249 : ptr = prs->state->prev->prev;
1805 249 : pfree(prs->state->prev);
1806 249 : prs->state->prev = ptr;
1807 : }
1808 47487 : else if (item->flags & A_CLRALL)
1809 : { /* clear all previous pushed state */
1810 : TParserPosition *ptr;
1811 :
1812 1389 : while (prs->state->prev)
1813 : {
1814 999 : ptr = prs->state->prev->prev;
1815 999 : pfree(prs->state->prev);
1816 999 : prs->state->prev = ptr;
1817 : }
1818 : }
1819 47097 : else if (item->flags & A_MERGE)
1820 : { /* merge posinfo with current and pushed state */
5710 tgl 1821 UBC 0 : TParserPosition *ptr = prs->state;
1822 :
1823 0 : Assert(prs->state->prev);
1824 0 : prs->state = prs->state->prev;
1825 :
1826 0 : prs->state->posbyte = ptr->posbyte;
1827 0 : prs->state->poschar = ptr->poschar;
1828 0 : prs->state->charlen = ptr->charlen;
5643 1829 0 : prs->state->lenbytetoken = ptr->lenbytetoken;
1830 0 : prs->state->lenchartoken = ptr->lenchartoken;
5710 1831 0 : pfree(ptr);
1832 : }
1833 :
1834 : /* set new state if pointed; applies to whichever position is now on top */
5710 tgl 1835 CBC 51585 : if (item->tostate != TPS_Null)
1836 33077 : prs->state->state = item->tostate;
1837 :
1838 : /* check for go away: token found, or input exhausted and no rerun asked */
5647 1839 51585 : if ((item->flags & A_BINGO) ||
1840 39512 : (prs->state->posbyte >= prs->lenstr &&
5647 tgl 1841 UBC 0 : (item->flags & A_RERUN) == 0))
1842 : break;
1843 :
1844 : /* go to beginning of loop if we should rerun or we just restore state */
5710 tgl 1845 CBC 39512 : if (item->flags & (A_RERUN | A_POP))
1846 1317 : continue;
1847 :
1848 : /* move forward (charlen is 0 only when we are at end of input) */
1849 38195 : if (prs->state->charlen)
1850 : {
1851 38195 : prs->state->posbyte += prs->state->charlen;
5643 1852 38195 : prs->state->lenbytetoken += prs->state->charlen;
5710 1853 38195 : prs->state->poschar++;
5643 1854 38195 : prs->state->lenchartoken++;
1855 : }
1856 : }
1857 :
545 michael 1858 12073 : return (item && (item->flags & A_BINGO));
1859 : }
1860 :
1861 : Datum
5710 tgl 1862 25703 : prsd_lextype(PG_FUNCTION_ARGS)
1863 : {
1864 25703 : LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1865 : int i;
1866 :
1867 616872 : for (i = 1; i <= LASTNUM; i++)
1868 : {
1869 591169 : descr[i - 1].lexid = i;
1870 591169 : descr[i - 1].alias = pstrdup(tok_alias[i]);
1871 591169 : descr[i - 1].descr = pstrdup(lex_descr[i]);
1872 : }
1873 :
1874 25703 : descr[LASTNUM].lexid = 0;
1875 :
1876 25703 : PG_RETURN_POINTER(descr);
1877 : }
1878 :
1879 : Datum
1880 2365 : prsd_start(PG_FUNCTION_ARGS)
1881 : {
1882 2365 : PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1883 : }
1884 :
1885 : Datum
1886 14318 : prsd_nexttoken(PG_FUNCTION_ARGS)
1887 : {
1888 14318 : TParser *p = (TParser *) PG_GETARG_POINTER(0);
1889 14318 : char **t = (char **) PG_GETARG_POINTER(1);
1890 14318 : int *tlen = (int *) PG_GETARG_POINTER(2);
1891 :
1892 14318 : if (!TParserGet(p))
1893 2365 : PG_RETURN_INT32(0);
1894 :
5643 1895 11953 : *t = p->token;
1896 11953 : *tlen = p->lenbytetoken;
1897 :
5710 1898 11953 : PG_RETURN_INT32(p->type);
1899 : }
1900 :
1901 : Datum
1902 2365 : prsd_end(PG_FUNCTION_ARGS)
1903 : {
1904 2365 : TParser *p = (TParser *) PG_GETARG_POINTER(0);
1905 :
1906 2365 : TParserClose(p);
1907 2365 : PG_RETURN_VOID();
1908 : }
1909 :
1910 :
1911 : /*
1912 : * ts_headline support begins here
1913 : */
1914 :
1915 : /*
 * token type classification macros
 *
 * Each takes a token type code (one of the "Output token categories"
 * constants) and yields nonzero if the type belongs to the class.  Most of
 * them expand (x) more than once, so pass an expression without side
 * effects.
 */
1916 : #define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
1917 : #define HLIDREPLACE(x) ( (x)==TAG_T )
1918 : #define HLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* XMLHLIDSKIP is what mark_fragment applies in highlightall mode; it currently lists the same types as HLIDSKIP */
1919 : #define XMLHLIDSKIP(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* NONWORDTOKEN: types that are not counted as words when measuring headline length */
1920 : #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
/* NOENDTOKEN: types BADENDPOINT refuses as a headline end (non-words, numeric types, ignorable types) */
1921 : #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
1922 :
1923 : /*
1924 : * Macros useful in headline selection. These rely on availability of
1925 : * "HeadlineParsedText *prs" describing some text, and "int shortword"
1926 : * describing the "short word" length parameter.
 *
 * In both macros j is an index into prs->words[]; it is expanded several
 * times, so it must not have side effects.
1927 : */
1928 :
1929 : /* Interesting words are non-repeated search terms (item is set, repeated is not) */
1930 : #define INTERESTINGWORD(j) \
1931 : (prs->words[j].item && !prs->words[j].repeated)
1932 :
1933 : /* Don't want to end at a non-word or a short word (len <= shortword), unless interesting */
1934 : #define BADENDPOINT(j) \
1935 : ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
1936 : !INTERESTINGWORD(j))
1937 :
1938 : typedef struct
1939 : {
1940 : /* one cover (well, really one fragment) for mark_hl_fragments */
1941 : int32 startpos; /* fragment's starting index in prs->words[] */
1942 : int32 endpos; /* ending index in prs->words[] (inclusive) */
1943 : int32 poslen; /* number of interesting words, as counted by get_next_fragment */
1944 : int32 curlen; /* number of words, not counting NONWORDTOKEN entries */
1945 : bool chosen; /* already picked as a headline fragment? */
1946 : bool excluded; /* overlaps a picked fragment, so no longer selectable? */
1947 : } CoverPos;
1948 :
1949 : typedef struct
1950 : {
1951 : /* callback data for checkcondition_HL: the slice of headline words to test */
1952 : HeadlineWordEntry *words; /* first entry of the slice */
1953 : int len; /* number of entries in the slice */
1954 : } hlCheck;
1955 :
1956 :
1957 : /*
1958 : * TS_execute callback for matching a tsquery operand to headline words
1959 : *
1960 : * Note: it's tempting to report words[] indexes as pos values to save
1961 : * searching in hlCover; but that would screw up phrase matching, which
1962 : * expects to measure distances in lexemes not tokens.
1963 : */
1964 : static TSTernaryValue
2558 teodor 1965 GIC 500 : checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
1966 : {
2495 rhaas 1967 500 : hlCheck *checkval = (hlCheck *) opaque;
1968 : int i;
5710 tgl 1969 ECB :
1970 : /* scan words array for matching items */
2558 teodor 1971 CBC 12725 : for (i = 0; i < checkval->len; i++)
1972 : {
2558 teodor 1973 GIC 12325 : if (checkval->words[i].item == val)
1974 : {
1095 tgl 1975 ECB : /* if data == NULL, don't need to report positions */
2558 teodor 1976 GIC 437 : if (!data)
989 tgl 1977 CBC 100 : return TS_YES;
1978 :
2558 teodor 1979 GIC 337 : if (!data->pos)
2558 teodor 1980 ECB : {
2558 teodor 1981 CBC 238 : data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
2558 teodor 1982 GIC 238 : data->allocated = true;
2558 teodor 1983 CBC 238 : data->npos = 1;
2558 teodor 1984 GIC 238 : data->pos[0] = checkval->words[i].pos;
2558 teodor 1985 ECB : }
2558 teodor 1986 CBC 99 : else if (data->pos[data->npos - 1] < checkval->words[i].pos)
2558 teodor 1987 ECB : {
2558 teodor 1988 CBC 99 : data->pos[data->npos++] = checkval->words[i].pos;
1989 : }
2558 teodor 1990 ECB : }
1991 : }
1992 :
2558 teodor 1993 GIC 400 : if (data && data->npos > 0)
989 tgl 1994 238 : return TS_YES;
1995 :
1996 162 : return TS_NO;
5710 tgl 1997 ECB : }
1998 :
1999 : /*
2000 : * hlCover: try to find a substring of prs' word list that satisfies query
2001 : *
2002 : * locations is the result of TS_execute_locations() for the query.
2003 : * We use this to identify plausible subranges of the query.
2004 : *
2005 : * *nextpos is the lexeme position (NOT word index) to start the search
2006 : * at. Caller should initialize this to zero. If successful, we'll
2007 : * advance it to the next place to search at.
2008 : *
2009 : * On success, sets *p to first word index and *q to last word index of the
2010 : * cover substring, and returns true.
 *
 * Returns false, leaving *nextpos, *p and *q untouched, when locations is
 * empty or when some entry of locations has no position at or after the
 * current search position.
2011 : *
2012 : * The result is a minimal cover, in the sense that both *p and *q will be
2013 : * words used in the query.
2014 : */
2015 : static bool
80 tgl 2016 GNC 281 : hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
2017 : int *nextpos, int *p, int *q)
2018 : {
2019 281 : int pos = *nextpos;
5710 tgl 2020 ECB :
2021 : /* This loop repeats when our selected word-range fails the query */
2022 : for (;;)
5710 tgl 2023 CBC 30 : {
2024 : int posb,
2025 : pose;
2026 : ListCell *lc;
80 tgl 2027 ECB :
2028 : /*
2029 : * For each AND'ed query term or phrase, find its first occurrence at
2030 : * or after pos; set pose to the maximum of those positions.
2031 : *
2032 : * We need not consider ORs or NOTs here; see the comments for
2033 : * TS_execute_locations(). Rechecking the match with TS_execute(),
2034 : * below, will deal with any ensuing imprecision.
2035 : */
80 tgl 2036 GNC 311 : pose = -1;
2037 483 : foreach(lc, locations)
2038 : {
2039 233 : ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2040 233 : int first = -1;
2041 :
2042 396 : for (int i = 0; i < pdata->npos; i++)
2043 : {
2044 : /* For phrase matches, use the ending lexeme */
2045 335 : int endp = pdata->pos[i];
2046 :
2047 335 : if (endp >= pos)
2048 : {
2049 172 : first = endp;
2050 172 : break;
2051 : }
2052 : }
2053 233 : if (first < 0)
2054 61 : return false; /* no more matches for this term */
2055 172 : if (first > pose)
2056 163 : pose = first;
80 tgl 2057 ECB : }
2058 :
80 tgl 2059 GNC 250 : if (pose < 0)
2060 123 : return false; /* we only get here if empty list */
2061 :
2062 : /*
2063 : * Now, for each AND'ed query term or phrase, find its last occurrence
2064 : * at or before pose; set posb to the minimum of those positions.
2065 : *
2066 : * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
2067 : * posb + 1 below.
2068 : */
2069 127 : posb = INT_MAX - 1;
2070 293 : foreach(lc, locations)
2071 : {
2072 166 : ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
2073 166 : int last = -1;
2074 :
2075 247 : for (int i = pdata->npos - 1; i >= 0; i--)
2076 : {
2077 : /* For phrase matches, use the starting lexeme */
2078 247 : int startp = pdata->pos[i] - pdata->width;
2079 :
2080 247 : if (startp <= pose)
2081 : {
2082 166 : last = startp;
2083 166 : break;
2084 : }
2085 : }
2086 166 : if (last < posb)
2087 136 : posb = last;
2088 : }
2089 :
2090 : /*
2091 : * We could end up with posb to the left of pos, in case some phrase
2092 : * match crosses pos. Try the match starting at pos anyway, since the
2093 : * result of TS_execute_locations is imprecise for phrase matches OR'd
2094 : * with plain matches; that is, if the query is "(A <-> B) | C" then C
2095 : * could match at pos even though the phrase match would have to
2096 : * extend to the left of pos.
2097 : */
2098 127 : posb = Max(posb, pos);
2099 :
2100 : /* This test probably always succeeds, but be paranoid */
2101 127 : if (posb <= pose)
2102 : {
2103 : /*
2104 : * posb .. pose is now the shortest, earliest-after-pos range of
2105 : * lexeme positions containing all the query terms. It will
2106 : * contain all phrase matches, too, except in the corner case
2107 : * described just above.
2108 : *
2109 : * Now convert these lexeme positions to indexes in prs->words[].
 * Only entries with a non-NULL item are considered: idxb becomes
 * the first such entry with pos >= posb, idxe the last one with
 * pos <= pose.
2110 : */
2111 127 : int idxb = -1;
2112 127 : int idxe = -1;
2113 :
2114 5812 : for (int i = 0; i < prs->curwords; i++)
2115 : {
2116 5748 : if (prs->words[i].item == NULL)
2117 5306 : continue;
2118 442 : if (idxb < 0 && prs->words[i].pos >= posb)
2119 127 : idxb = i;
2120 442 : if (prs->words[i].pos <= pose)
2121 379 : idxe = i;
2122 : else
2123 63 : break;
2124 : }
2125 :
2126 : /* This test probably always succeeds, but be paranoid */
2127 127 : if (idxb >= 0 && idxe >= idxb)
2128 : {
2129 : /*
2130 : * Finally, check that the selected range satisfies the query.
2131 : * This should succeed in all simple cases; but odd cases
2132 : * involving non-top-level NOT conditions or phrase matches
2133 : * OR'd with other things could fail, since the result of
2134 : * TS_execute_locations doesn't fully represent such things.
2135 : */
2136 : hlCheck ch;
2137 :
2138 127 : ch.words = &(prs->words[idxb]);
2139 127 : ch.len = idxe - idxb + 1;
2140 127 : if (TS_execute(GETQUERY(query), &ch,
2141 : TS_EXEC_EMPTY, checkcondition_HL))
2142 : {
2143 : /* Match! Advance *nextpos and return the word range. */
2144 97 : *nextpos = posb + 1;
2145 97 : *p = idxb;
2146 97 : *q = idxe;
2147 97 : return true;
2148 : }
2149 : }
2150 : }
2151 :
2152 : /*
2153 : * Advance pos and try again. Any later workable match must start
2154 : * beyond posb.
2155 : */
2156 30 : pos = posb + 1;
5710 tgl 2157 ECB : }
2158 : /* Can't get here, but stupider compilers complain if we leave it off */
2159 : return false;
2160 : }
2161 :
1095 2162 : /*
2163 : * Apply suitable highlight marking to words selected by headline selector
2164 : *
2165 : * The words from startpos to endpos inclusive are marked per highlightall
2166 : */
2167 : static void
1095 tgl 2168 GIC 193 : mark_fragment(HeadlineParsedText *prs, bool highlightall,
2169 : int startpos, int endpos)
2170 : {
2171 : int i;
2172 :
5287 teodor 2173 2827 : for (i = startpos; i <= endpos; i++)
5287 teodor 2174 ECB : {
5287 teodor 2175 GIC 2634 : if (prs->words[i].item)
2176 250 : prs->words[i].selected = 1;
1095 tgl 2177 CBC 2634 : if (!highlightall)
2178 : {
5197 teodor 2179 GIC 2511 : if (HLIDREPLACE(prs->words[i].type))
5287 teodor 2180 UIC 0 : prs->words[i].replace = 1;
4382 bruce 2181 GIC 2511 : else if (HLIDSKIP(prs->words[i].type))
5197 teodor 2182 UIC 0 : prs->words[i].skip = 1;
2183 : }
2184 : else
2185 : {
5197 teodor 2186 GIC 123 : if (XMLHLIDSKIP(prs->words[i].type))
5197 teodor 2187 CBC 3 : prs->words[i].skip = 1;
5287 teodor 2188 ECB : }
2189 :
5287 teodor 2190 CBC 2634 : prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2191 : }
2192 193 : }
5287 teodor 2193 ECB :
1095 tgl 2194 : /*
2195 : * split a cover substring into fragments not longer than max_words
2196 : *
2197 : * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2198 : * substring. They are updated to hold the bounds of the next fragment.
2199 : *
2200 : * *curlen and *poslen are set to the fragment's length, in words and
2201 : * interesting words respectively.
2202 : */
4520 peter_e 2203 : static void
5287 teodor 2204 GIC 18 : get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2205 : int *curlen, int *poslen, int max_words)
2206 : {
2207 : int i;
2208 :
2209 : /*
2210 : * Objective: select a fragment of words between startpos and endpos such
2211 : * that it has at most max_words and both ends have query words. If the
2212 : * startpos and endpos are the endpoints of the cover and the cover has
2213 : * fewer words than max_words, then this function should just return the
1095 tgl 2214 ECB : * cover
5287 teodor 2215 : */
2216 : /* first move startpos to an item */
4382 bruce 2217 GIC 444 : for (i = *startpos; i <= *endpos; i++)
2218 : {
5287 teodor 2219 444 : *startpos = i;
1095 tgl 2220 CBC 444 : if (INTERESTINGWORD(i))
5287 teodor 2221 18 : break;
5287 teodor 2222 ECB : }
2223 : /* cut endpos to have only max_words */
5287 teodor 2224 GIC 18 : *curlen = 0;
2225 18 : *poslen = 0;
4382 bruce 2226 480 : for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2227 : {
5287 teodor 2228 462 : if (!NONWORDTOKEN(prs->words[i].type))
2229 240 : *curlen += 1;
1095 tgl 2230 462 : if (INTERESTINGWORD(i))
5287 teodor 2231 27 : *poslen += 1;
5287 teodor 2232 ECB : }
2233 : /* if the cover was cut then move back endpos to a query item */
5287 teodor 2234 GIC 18 : if (*endpos > i)
2235 : {
2236 6 : *endpos = i;
4382 bruce 2237 420 : for (i = *endpos; i >= *startpos; i--)
2238 : {
5287 teodor 2239 420 : *endpos = i;
1095 tgl 2240 420 : if (INTERESTINGWORD(i))
5287 teodor 2241 6 : break;
2242 414 : if (!NONWORDTOKEN(prs->words[i].type))
2243 204 : *curlen -= 1;
4520 peter_e 2244 ECB : }
2245 : }
5287 teodor 2246 GIC 18 : }
2247 :
2248 : /*
1095 tgl 2249 ECB : * Headline selector used when MaxFragments > 0
2250 : *
2251 : * Note: in this mode, highlightall is disregarded for phrase selection;
2252 : * it only controls presentation details.
 *
 * Works in three steps: collect every cover reported by hlCover, split into
 * fragments of at most max_words words by get_next_fragment; then, up to
 * max_fragments times, pick the best remaining fragment, stretch it towards
 * max_words and mark it with mark_fragment; finally, if nothing was marked,
 * mark the leading words of the document instead.
 *
 * prs: parsed document whose words[] entries get marked
 * query, locations: passed through to hlCover
 * highlightall: passed through to mark_fragment
 * shortword: consulted through the BADENDPOINT macro
 * min_words: number of words marked by the fallback when no fragment is found
 * max_words: upper bound on the number of words in one fragment
 * max_fragments: maximum number of fragments to mark
2253 : */
2254 : static void
80 tgl 2255 GNC 15 : mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations,
2256 : bool highlightall,
4382 bruce 2257 EUB : int shortword, int min_words,
2258 : int max_words, int max_fragments)
5287 teodor 2259 : {
2260 : int32 poslen,
2261 : curlen,
2262 : i,
4382 bruce 2263 ECB : f,
4382 bruce 2264 CBC 15 : num_f = 0;
2265 : int32 stretch,
2266 : maxstretch,
4382 bruce 2267 ECB : posmarker;
2268 :
3940 peter_e 2269 CBC 15 : int32 startpos = 0,
4382 bruce 2270 GIC 15 : endpos = 0,
80 tgl 2271 GNC 15 : nextpos = 0,
4382 bruce 2272 GIC 15 : p = 0,
2273 15 : q = 0;
2274 :
3940 peter_e 2275 15 : int32 numcovers = 0,
4382 bruce 2276 15 : maxcovers = 32;
2277 :
2278 : int32 minI,
2279 : minwords,
2280 : maxitems;
2281 : CoverPos *covers;
5287 teodor 2282 ECB :
5287 teodor 2283 GIC 15 : covers = palloc(maxcovers * sizeof(CoverPos));
2284 :
2285 : /* get all covers */
80 tgl 2286 GNC 27 : while (hlCover(prs, query, locations, &nextpos, &p, &q))
2287 : {
5287 teodor 2288 GIC 12 : startpos = p;
4382 bruce 2289 12 : endpos = q;
2290 :
2291 : /*
2292 : * Break the cover into smaller fragments such that each fragment has
2293 : * at most max_words. Also ensure that each end of each fragment is a
2294 : * query word. This will allow us to stretch the fragment in either
4382 bruce 2295 ECB : * direction
2296 : */
5287 teodor 2297 :
5287 teodor 2298 CBC 30 : while (startpos <= endpos)
5287 teodor 2299 ECB : {
5287 teodor 2300 GIC 18 : get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2301 18 : if (numcovers >= maxcovers)
5287 teodor 2302 ECB : {
 /* covers[] is full: double its size */
5287 teodor 2303 LBC 0 : maxcovers *= 2;
4382 bruce 2304 0 : covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2305 : }
5287 teodor 2306 CBC 18 : covers[numcovers].startpos = startpos;
4382 bruce 2307 18 : covers[numcovers].endpos = endpos;
2308 18 : covers[numcovers].curlen = curlen;
2309 18 : covers[numcovers].poslen = poslen;
1095 tgl 2310 GIC 18 : covers[numcovers].chosen = false;
2311 18 : covers[numcovers].excluded = false;
4382 bruce 2312 CBC 18 : numcovers++;
 /* continue with whatever remains of this cover after the fragment */
5287 teodor 2313 GIC 18 : startpos = endpos + 1;
4382 bruce 2314 CBC 18 : endpos = q;
4520 peter_e 2315 ECB : }
5287 teodor 2316 : }
5710 tgl 2317 :
5287 teodor 2318 : /* choose best covers */
5287 teodor 2319 GIC 33 : for (f = 0; f < max_fragments; f++)
2320 : {
5287 teodor 2321 CBC 24 : maxitems = 0;
2929 andres 2322 GIC 24 : minwords = PG_INT32_MAX;
5287 teodor 2323 24 : minI = -1;
2324 :
2325 : /*
2326 : * Choose the cover that contains max items. In case of tie choose the
2327 : * one with smaller number of words.
2328 : */
4382 bruce 2329 57 : for (i = 0; i < numcovers; i++)
5287 teodor 2330 ECB : {
1095 tgl 2331 GIC 33 : if (!covers[i].chosen && !covers[i].excluded &&
2332 24 : (maxitems < covers[i].poslen ||
2333 6 : (maxitems == covers[i].poslen &&
2334 6 : minwords > covers[i].curlen)))
2335 : {
5287 teodor 2336 18 : maxitems = covers[i].poslen;
2337 18 : minwords = covers[i].curlen;
4382 bruce 2338 18 : minI = i;
5287 teodor 2339 ECB : }
2340 : }
2341 : /* if a cover was found mark it */
5287 teodor 2342 GIC 24 : if (minI >= 0)
2343 : {
1095 tgl 2344 CBC 18 : covers[minI].chosen = true;
5287 teodor 2345 ECB : /* adjust the size of cover */
5287 teodor 2346 CBC 18 : startpos = covers[minI].startpos;
4382 bruce 2347 18 : endpos = covers[minI].endpos;
2348 18 : curlen = covers[minI].curlen;
2349 : /* stretch the cover if cover size is lower than max_words */
4520 peter_e 2350 18 : if (curlen < max_words)
5287 teodor 2351 ECB : {
2352 : /* divide the stretch on both sides of cover */
4382 bruce 2353 GIC 18 : maxstretch = (max_words - curlen) / 2;
2354 :
2355 : /*
2356 : * first stretch the startpos stop stretching if 1. we hit the
2357 : * beginning of document 2. exceed maxstretch 3. we hit an
4382 bruce 2358 ECB : * already marked fragment
2359 : */
4382 bruce 2360 GIC 18 : stretch = 0;
5287 teodor 2361 CBC 18 : posmarker = startpos;
5287 teodor 2362 GIC 300 : for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
5287 teodor 2363 ECB : {
5287 teodor 2364 CBC 282 : if (!NONWORDTOKEN(prs->words[i].type))
2365 : {
4382 bruce 2366 GIC 135 : curlen++;
2367 135 : stretch++;
2368 : }
5287 teodor 2369 282 : posmarker = i;
2370 : }
2371 : /* cut back startpos till we find a good endpoint */
1095 tgl 2372 66 : for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
5287 teodor 2373 ECB : {
5287 teodor 2374 GIC 48 : if (!NONWORDTOKEN(prs->words[i].type))
4382 bruce 2375 CBC 18 : curlen--;
5287 teodor 2376 ECB : }
5287 teodor 2377 GIC 18 : startpos = i;
4382 bruce 2378 EUB : /* now stretch the endpos as much as possible */
5287 teodor 2379 GBC 18 : posmarker = endpos;
5287 teodor 2380 GIC 483 : for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
5287 teodor 2381 ECB : {
5287 teodor 2382 CBC 465 : if (!NONWORDTOKEN(prs->words[i].type))
4382 bruce 2383 231 : curlen++;
4520 peter_e 2384 465 : posmarker = i;
5287 teodor 2385 ECB : }
1095 tgl 2386 : /* cut back endpos till we find a good endpoint */
1095 tgl 2387 CBC 45 : for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
5287 teodor 2388 ECB : {
5287 teodor 2389 CBC 27 : if (!NONWORDTOKEN(prs->words[i].type))
4382 bruce 2390 GIC 12 : curlen--;
2391 : }
5287 teodor 2392 18 : endpos = i;
2393 : }
5287 teodor 2394 CBC 18 : covers[minI].startpos = startpos;
4382 bruce 2395 GIC 18 : covers[minI].endpos = endpos;
4382 bruce 2396 CBC 18 : covers[minI].curlen = curlen;
5287 teodor 2397 ECB : /* Mark the chosen fragments (covers) */
1095 tgl 2398 CBC 18 : mark_fragment(prs, highlightall, startpos, endpos);
4382 bruce 2399 GIC 18 : num_f++;
2400 : /* Exclude covers overlapping this one from future consideration */
2401 48 : for (i = 0; i < numcovers; i++)
2402 : {
1095 tgl 2403 30 : if (i != minI &&
1095 tgl 2404 CBC 12 : ((covers[i].startpos >= startpos &&
1095 tgl 2405 GIC 6 : covers[i].startpos <= endpos) ||
1095 tgl 2406 CBC 12 : (covers[i].endpos >= startpos &&
2407 6 : covers[i].endpos <= endpos) ||
2408 12 : (covers[i].startpos < startpos &&
2409 6 : covers[i].endpos > endpos)))
1095 tgl 2410 UIC 0 : covers[i].excluded = true;
5287 teodor 2411 ECB : }
2412 : }
2413 : else
1095 tgl 2414 GIC 6 : break; /* no selectable covers remain */
2415 : }
2416 :
1095 tgl 2417 ECB : /* show the first min_words words if we have not marked anything */
5287 teodor 2418 GIC 15 : if (num_f <= 0)
5287 teodor 2419 ECB : {
3 tgl 2420 GNC 3 : startpos = curlen = 0;
 /* endpos stays -1 if the document has no words, so nothing gets marked */
2421 3 : endpos = -1;
5287 teodor 2422 CBC 93 : for (i = 0; i < prs->curwords && curlen < min_words; i++)
5287 teodor 2423 ECB : {
5287 teodor 2424 CBC 90 : if (!NONWORDTOKEN(prs->words[i].type))
5287 teodor 2425 GIC 45 : curlen++;
5287 teodor 2426 CBC 90 : endpos = i;
2427 : }
1095 tgl 2428 GIC 3 : mark_fragment(prs, highlightall, startpos, endpos);
5287 teodor 2429 ECB : }
2430 :
5287 teodor 2431 GIC 15 : pfree(covers);
2432 15 : }
2433 :
2434 : /*
2435 : * Headline selector used when MaxFragments == 0
 *
 * Selects one contiguous range of prs->words[] and marks it with
 * mark_fragment.  Unless highlightall is set, every cover reported by
 * hlCover is turned into a candidate headline of roughly min_words to
 * max_words words and the best candidate wins; if there is no cover at all,
 * the first min_words words of the document are used.  With highlightall
 * set, the whole document (indexes 0 .. curwords - 1) is the headline.
 *
 * prs: parsed document whose words[] entries get marked
 * query, locations: passed through to hlCover
 * highlightall: select the whole document; also passed to mark_fragment
 * shortword: consulted through the BADENDPOINT macro
 * min_words, max_words: bounds on the headline length, counting only words
 * for which NONWORDTOKEN is false
1095 tgl 2436 ECB : */
5287 teodor 2437 : static void
80 tgl 2438 GNC 172 : mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations,
2439 : bool highlightall,
2440 : int shortword, int min_words, int max_words)
5287 teodor 2441 ECB : {
80 tgl 2442 GNC 172 : int nextpos = 0,
2443 172 : p = 0,
5710 tgl 2444 CBC 172 : q = 0;
2445 172 : int bestb = -1,
5710 tgl 2446 GIC 172 : beste = -1;
5710 tgl 2447 CBC 172 : int bestlen = -1;
1095 tgl 2448 GIC 172 : bool bestcover = false;
2449 : int pose,
5710 tgl 2450 ECB : posb,
2451 : poslen,
2452 : curlen;
1095 2453 : bool poscover;
2454 : int i;
5710 2455 :
1095 tgl 2456 GIC 172 : if (!highlightall)
5710 tgl 2457 ECB : {
1095 2458 : /* examine all covers, select a headline using the best one */
80 tgl 2459 GNC 254 : while (hlCover(prs, query, locations, &nextpos, &p, &q))
5710 tgl 2460 ECB : {
1095 2461 : /*
2462 : * Count words (curlen) and interesting words (poslen) within
2463 : * cover, but stop once we reach max_words. This step doesn't
2464 : * consider whether that's a good stopping point. posb and pose
2465 : * are set to the start and end indexes of the possible headline.
2466 : */
5710 tgl 2467 CBC 85 : curlen = 0;
2468 85 : poslen = 0;
1095 tgl 2469 GIC 85 : posb = pose = p;
5710 tgl 2470 CBC 728 : for (i = p; i <= q && curlen < max_words; i++)
2471 : {
2472 643 : if (!NONWORDTOKEN(prs->words[i].type))
2473 364 : curlen++;
1095 2474 643 : if (INTERESTINGWORD(i))
5710 tgl 2475 GIC 145 : poslen++;
5710 tgl 2476 CBC 643 : pose = i;
5710 tgl 2477 ECB : }
2478 :
5710 tgl 2479 CBC 85 : if (curlen < max_words)
2480 : {
1095 tgl 2481 ECB : /*
2482 : * We have room to lengthen the headline, so search forward
2483 : * until it's full or we find a good stopping point. We'll
2484 : * reconsider the word at "q", then move forward.
2485 : */
5710 tgl 2486 CBC 1469 : for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
5710 tgl 2487 ECB : {
1095 tgl 2488 GBC 1456 : if (i > q)
2489 : {
5710 tgl 2490 GIC 1377 : if (!NONWORDTOKEN(prs->words[i].type))
2491 687 : curlen++;
1095 tgl 2492 CBC 1377 : if (INTERESTINGWORD(i))
5710 tgl 2493 GIC 60 : poslen++;
2494 : }
2495 1456 : pose = i;
1095 tgl 2496 CBC 1456 : if (BADENDPOINT(i))
5710 tgl 2497 GIC 972 : continue;
5710 tgl 2498 CBC 484 : if (curlen >= min_words)
2499 66 : break;
5710 tgl 2500 ECB : }
1095 tgl 2501 GIC 79 : if (curlen < min_words)
1095 tgl 2502 ECB : {
2503 : /*
2504 : * Reached end of text and our headline is still shorter
2505 : * than min_words, so try to extend it to the left.
2506 : */
5197 teodor 2507 GIC 183 : for (i = p - 1; i >= 0; i--)
2508 : {
5710 tgl 2509 CBC 182 : if (!NONWORDTOKEN(prs->words[i].type))
2510 91 : curlen++;
1095 tgl 2511 GIC 182 : if (INTERESTINGWORD(i))
5710 2512 3 : poslen++;
4382 bruce 2513 182 : if (curlen >= max_words)
5197 teodor 2514 UIC 0 : break;
1095 tgl 2515 GIC 182 : if (BADENDPOINT(i))
5710 tgl 2516 CBC 118 : continue;
5710 tgl 2517 GIC 64 : if (curlen >= min_words)
2518 12 : break;
2519 : }
 /* if we ran off the front of the text, start at word 0 */
5710 tgl 2520 CBC 13 : posb = (i >= 0) ? i : 0;
5710 tgl 2521 ECB : }
2522 : }
2523 : else
1095 2524 : {
2525 : /*
2526 : * Can't make headline longer, so consider making it shorter
2527 : * if needed to avoid a bad endpoint.
2528 : */
2557 teodor 2529 GIC 6 : if (i > q)
2530 3 : i = q;
5710 tgl 2531 15 : for (; curlen > min_words; i--)
2532 : {
1095 2533 15 : if (!BADENDPOINT(i))
1095 tgl 2534 ECB : break;
5710 tgl 2535 GIC 9 : if (!NONWORDTOKEN(prs->words[i].type))
2536 3 : curlen--;
1095 tgl 2537 CBC 9 : if (INTERESTINGWORD(i))
5710 tgl 2538 UIC 0 : poslen--;
1095 tgl 2539 GIC 9 : pose = i - 1;
2540 : }
2541 : }
2542 :
2543 : /*
2544 : * Check whether the proposed headline includes the original
1095 tgl 2545 ECB : * cover; it might not if we trimmed it due to max_words.
2546 : */
1095 tgl 2547 CBC 85 : poscover = (posb <= p && pose >= q);
1095 tgl 2548 ECB :
2549 : /*
2550 : * Adopt this headline if it's better than the last one, giving
2551 : * highest priority to headlines including the cover, then to
2552 : * headlines with more interesting words, then to headlines with
2553 : * good stopping points. (Since bestlen is initially -1, we will
2554 : * certainly adopt the first headline.)
2555 : */
1095 tgl 2556 GIC 85 : if (poscover > bestcover ||
1095 tgl 2557 CBC 39 : (poscover == bestcover && poslen > bestlen) ||
1095 tgl 2558 GIC 36 : (poscover == bestcover && poslen == bestlen &&
2559 6 : !BADENDPOINT(pose) && BADENDPOINT(beste)))
2560 : {
5710 2561 49 : bestb = posb;
2562 49 : beste = pose;
2563 49 : bestlen = poslen;
1095 tgl 2564 CBC 49 : bestcover = poscover;
2565 : }
5710 tgl 2566 ECB : }
2567 :
1095 2568 : /*
2569 : * If we found nothing acceptable, select min_words words starting at
2570 : * the beginning.
 * (pose stays -1 if the document has no words, so nothing is marked.)
2571 : */
5710 tgl 2572 CBC 169 : if (bestlen < 0)
5710 tgl 2573 ECB : {
5710 tgl 2574 CBC 120 : curlen = 0;
3 tgl 2575 GNC 120 : pose = -1;
5710 tgl 2576 CBC 519 : for (i = 0; i < prs->curwords && curlen < min_words; i++)
2577 : {
5710 tgl 2578 GIC 399 : if (!NONWORDTOKEN(prs->words[i].type))
2579 258 : curlen++;
2580 399 : pose = i;
2581 : }
5710 tgl 2582 CBC 120 : bestb = 0;
5710 tgl 2583 GIC 120 : beste = pose;
5710 tgl 2584 ECB : }
2585 : }
2586 : else
2587 : {
1095 2588 : /* highlightall mode: headline is whole document */
5710 tgl 2589 GBC 3 : bestb = 0;
5710 tgl 2590 CBC 3 : beste = prs->curwords - 1;
5710 tgl 2591 ECB : }
2592 :
1095 tgl 2593 CBC 172 : mark_fragment(prs, highlightall, bestb, beste);
5287 teodor 2594 GIC 172 : }
5287 teodor 2595 ECB :
2596 : /*
2597 : * Default parser's prsheadline function
2598 : */
2599 : Datum
5287 teodor 2600 GIC 187 : prsd_headline(PG_FUNCTION_ARGS)
2601 : {
2602 187 : HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2603 187 : List *prsoptions = (List *) PG_GETARG_POINTER(1);
5287 teodor 2604 CBC 187 : TSQuery query = PG_GETARG_TSQUERY(2);
2605 : List *locations;
5287 teodor 2606 ECB :
1095 tgl 2607 : /* default option values: */
4382 bruce 2608 GIC 187 : int min_words = 15;
4382 bruce 2609 CBC 187 : int max_words = 35;
4382 bruce 2610 GIC 187 : int shortword = 3;
5287 teodor 2611 CBC 187 : int max_fragments = 0;
1095 tgl 2612 187 : bool highlightall = false;
5287 teodor 2613 EUB : ListCell *l;
5287 teodor 2614 ECB :
2615 : /* Extract configuration option values */
5287 teodor 2616 GIC 187 : prs->startsel = NULL;
2617 187 : prs->stopsel = NULL;
1095 tgl 2618 187 : prs->fragdelim = NULL;
5287 teodor 2619 364 : foreach(l, prsoptions)
2620 : {
2621 177 : DefElem *defel = (DefElem *) lfirst(l);
5287 teodor 2622 CBC 177 : char *val = defGetString(defel);
2623 :
5287 teodor 2624 GIC 177 : if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
1722 andres 2625 18 : max_words = pg_strtoint32(val);
5287 teodor 2626 159 : else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
1722 andres 2627 18 : min_words = pg_strtoint32(val);
5287 teodor 2628 141 : else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
1722 andres 2629 UIC 0 : shortword = pg_strtoint32(val);
5287 teodor 2630 GIC 141 : else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
1722 andres 2631 CBC 15 : max_fragments = pg_strtoint32(val);
5287 teodor 2632 126 : else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2633 60 : prs->startsel = pstrdup(val);
2634 66 : else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
5287 teodor 2635 GIC 60 : prs->stopsel = pstrdup(val);
5287 teodor 2636 CBC 6 : else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2637 3 : prs->fragdelim = pstrdup(val);
2638 3 : else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
1095 tgl 2639 9 : highlightall = (pg_strcasecmp(val, "1") == 0 ||
1095 tgl 2640 GIC 6 : pg_strcasecmp(val, "on") == 0 ||
2641 3 : pg_strcasecmp(val, "true") == 0 ||
1095 tgl 2642 UIC 0 : pg_strcasecmp(val, "t") == 0 ||
1095 tgl 2643 GIC 6 : pg_strcasecmp(val, "y") == 0 ||
1095 tgl 2644 UIC 0 : pg_strcasecmp(val, "yes") == 0);
2645 : else
5287 teodor 2646 0 : ereport(ERROR,
5287 teodor 2647 ECB : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2648 : errmsg("unrecognized headline parameter: \"%s\"",
2649 : defel->defname)));
2650 : }
2651 :
2652 : /* in HighlightAll mode these parameters are ignored */
1095 tgl 2653 GIC 187 : if (!highlightall)
2654 : {
5287 teodor 2655 CBC 184 : if (min_words >= max_words)
5287 teodor 2656 LBC 0 : ereport(ERROR,
2657 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2658 : errmsg("MinWords should be less than MaxWords")));
5287 teodor 2659 CBC 184 : if (min_words <= 0)
5287 teodor 2660 LBC 0 : ereport(ERROR,
2661 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2662 : errmsg("MinWords should be positive")));
5287 teodor 2663 GIC 184 : if (shortword < 0)
5287 teodor 2664 UIC 0 : ereport(ERROR,
2665 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5287 teodor 2666 ECB : errmsg("ShortWord should be >= 0")));
5287 teodor 2667 GIC 184 : if (max_fragments < 0)
5287 teodor 2668 LBC 0 : ereport(ERROR,
5287 teodor 2669 ECB : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2670 : errmsg("MaxFragments should be >= 0")));
2671 : }
2672 :
2673 : /* Locate words and phrases matching the query */
3 tgl 2674 GNC 187 : if (query->size > 0)
2675 : {
2676 : hlCheck ch;
2677 :
2678 181 : ch.words = prs->words;
2679 181 : ch.len = prs->curwords;
2680 181 : locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
2681 : checkcondition_HL);
2682 : }
2683 : else
2684 6 : locations = NIL; /* empty query matches nothing */
2685 :
2686 : /* Apply appropriate headline selector */
5287 teodor 2687 CBC 187 : if (max_fragments == 0)
80 tgl 2688 GNC 172 : mark_hl_words(prs, query, locations, highlightall, shortword,
2689 : min_words, max_words);
5287 teodor 2690 ECB : else
80 tgl 2691 GNC 15 : mark_hl_fragments(prs, query, locations, highlightall, shortword,
2692 : min_words, max_words, max_fragments);
2693 :
2694 : /* Fill in default values for string options */
5710 tgl 2695 CBC 187 : if (!prs->startsel)
2696 127 : prs->startsel = pstrdup("<b>");
2697 187 : if (!prs->stopsel)
2698 127 : prs->stopsel = pstrdup("</b>");
5287 teodor 2699 GIC 187 : if (!prs->fragdelim)
5287 teodor 2700 CBC 184 : prs->fragdelim = pstrdup(" ... ");
1095 tgl 2701 ECB :
2702 : /* Caller will need these lengths, too */
5710 tgl 2703 CBC 187 : prs->startsellen = strlen(prs->startsel);
2704 187 : prs->stopsellen = strlen(prs->stopsel);
5287 teodor 2705 187 : prs->fragdelimlen = strlen(prs->fragdelim);
5710 tgl 2706 ECB :
5710 tgl 2707 CBC 187 : PG_RETURN_POINTER(prs);
5710 tgl 2708 EUB : }
|