LCOV - differential code coverage report
Current view: top level - src/backend/tsearch - wparser_def.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 89.3 % 624 557 5 10 52 3 144 66 344 12 170 38
Current Date: 2023-04-08 17:13:01 Functions: 71.2 % 52 37 15 4 3 30 6 1
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 [..60] days: 100.0 % 9 9 8 1
Legend: Lines: hit not hit (60,120] days: 100.0 % 58 58 58 2
(240..) days: 88.0 % 557 490 5 10 52 3 144 343 5 132
Function coverage date bins:
(60,120] days: 100.0 % 3 3 3
(240..) days: 61.8 % 55 34 15 4 30 6

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * wparser_def.c
                                  4                 :  *      Default text search parser
                                  5                 :  *
                                  6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                  7                 :  *
                                  8                 :  *
                                  9                 :  * IDENTIFICATION
                                 10                 :  *    src/backend/tsearch/wparser_def.c
                                 11                 :  *
                                 12                 :  *-------------------------------------------------------------------------
                                 13                 :  */
                                 14                 : 
                                 15                 : #include "postgres.h"
                                 16                 : 
                                 17                 : #include <limits.h>
                                 18                 : 
                                 19                 : #include "catalog/pg_collation.h"
                                 20                 : #include "commands/defrem.h"
                                 21                 : #include "tsearch/ts_locale.h"
                                 22                 : #include "tsearch/ts_public.h"
                                 23                 : #include "tsearch/ts_type.h"
                                 24                 : #include "tsearch/ts_utils.h"
                                 25                 : #include "utils/builtins.h"
                                 26                 : 
                                 27                 : 
                                 28                 : /* Define me to enable tracing of parser behavior */
                                 29                 : /* #define WPARSER_TRACE */
                                 30                 : 
                                 31                 : 
                                 32                 : /* Output token categories */
                                 33                 : 
                                 34                 : #define ASCIIWORD       1
                                 35                 : #define WORD_T          2
                                 36                 : #define NUMWORD         3
                                 37                 : #define EMAIL           4
                                 38                 : #define URL_T           5
                                 39                 : #define HOST            6
                                 40                 : #define SCIENTIFIC      7
                                 41                 : #define VERSIONNUMBER   8
                                 42                 : #define NUMPARTHWORD    9
                                 43                 : #define PARTHWORD       10
                                 44                 : #define ASCIIPARTHWORD  11
                                 45                 : #define SPACE           12
                                 46                 : #define TAG_T           13
                                 47                 : #define PROTOCOL        14
                                 48                 : #define NUMHWORD        15
                                 49                 : #define ASCIIHWORD      16
                                 50                 : #define HWORD           17
                                 51                 : #define URLPATH         18
                                 52                 : #define FILEPATH        19
                                 53                 : #define DECIMAL_T       20
                                 54                 : #define SIGNEDINT       21
                                 55                 : #define UNSIGNEDINT     22
                                 56                 : #define XMLENTITY       23
                                 57                 : 
                                 58                 : #define LASTNUM         23
                                 59                 : 
                                 60                 : static const char *const tok_alias[] = {
                                 61                 :     "",
                                 62                 :     "asciiword",
                                 63                 :     "word",
                                 64                 :     "numword",
                                 65                 :     "email",
                                 66                 :     "url",
                                 67                 :     "host",
                                 68                 :     "sfloat",
                                 69                 :     "version",
                                 70                 :     "hword_numpart",
                                 71                 :     "hword_part",
                                 72                 :     "hword_asciipart",
                                 73                 :     "blank",
                                 74                 :     "tag",
                                 75                 :     "protocol",
                                 76                 :     "numhword",
                                 77                 :     "asciihword",
                                 78                 :     "hword",
                                 79                 :     "url_path",
                                 80                 :     "file",
                                 81                 :     "float",
                                 82                 :     "int",
                                 83                 :     "uint",
                                 84                 :     "entity"
                                 85                 : };
                                 86                 : 
                                 87                 : static const char *const lex_descr[] = {
                                 88                 :     "",
                                 89                 :     "Word, all ASCII",
                                 90                 :     "Word, all letters",
                                 91                 :     "Word, letters and digits",
                                 92                 :     "Email address",
                                 93                 :     "URL",
                                 94                 :     "Host",
                                 95                 :     "Scientific notation",
                                 96                 :     "Version number",
                                 97                 :     "Hyphenated word part, letters and digits",
                                 98                 :     "Hyphenated word part, all letters",
                                 99                 :     "Hyphenated word part, all ASCII",
                                100                 :     "Space symbols",
                                101                 :     "XML tag",
                                102                 :     "Protocol head",
                                103                 :     "Hyphenated word, letters and digits",
                                104                 :     "Hyphenated word, all ASCII",
                                105                 :     "Hyphenated word, all letters",
                                106                 :     "URL path",
                                107                 :     "File or path name",
                                108                 :     "Decimal notation",
                                109                 :     "Signed integer",
                                110                 :     "Unsigned integer",
                                111                 :     "XML entity"
                                112                 : };
                                113                 : 
                                114                 : 
                                115                 : /* Parser states */
                                116                 : 
                                117                 : typedef enum
                                118                 : {
                                119                 :     TPS_Base = 0,
                                120                 :     TPS_InNumWord,
                                121                 :     TPS_InAsciiWord,
                                122                 :     TPS_InWord,
                                123                 :     TPS_InUnsignedInt,
                                124                 :     TPS_InSignedIntFirst,
                                125                 :     TPS_InSignedInt,
                                126                 :     TPS_InSpace,
                                127                 :     TPS_InUDecimalFirst,
                                128                 :     TPS_InUDecimal,
                                129                 :     TPS_InDecimalFirst,
                                130                 :     TPS_InDecimal,
                                131                 :     TPS_InVerVersion,
                                132                 :     TPS_InSVerVersion,
                                133                 :     TPS_InVersionFirst,
                                134                 :     TPS_InVersion,
                                135                 :     TPS_InMantissaFirst,
                                136                 :     TPS_InMantissaSign,
                                137                 :     TPS_InMantissa,
                                138                 :     TPS_InXMLEntityFirst,
                                139                 :     TPS_InXMLEntity,
                                140                 :     TPS_InXMLEntityNumFirst,
                                141                 :     TPS_InXMLEntityNum,
                                142                 :     TPS_InXMLEntityHexNumFirst,
                                143                 :     TPS_InXMLEntityHexNum,
                                144                 :     TPS_InXMLEntityEnd,
                                145                 :     TPS_InTagFirst,
                                146                 :     TPS_InXMLBegin,
                                147                 :     TPS_InTagCloseFirst,
                                148                 :     TPS_InTagName,
                                149                 :     TPS_InTagBeginEnd,
                                150                 :     TPS_InTag,
                                151                 :     TPS_InTagEscapeK,
                                152                 :     TPS_InTagEscapeKK,
                                153                 :     TPS_InTagBackSleshed,
                                154                 :     TPS_InTagEnd,
                                155                 :     TPS_InCommentFirst,
                                156                 :     TPS_InCommentLast,
                                157                 :     TPS_InComment,
                                158                 :     TPS_InCloseCommentFirst,
                                159                 :     TPS_InCloseCommentLast,
                                160                 :     TPS_InCommentEnd,
                                161                 :     TPS_InHostFirstDomain,
                                162                 :     TPS_InHostDomainSecond,
                                163                 :     TPS_InHostDomain,
                                164                 :     TPS_InPortFirst,
                                165                 :     TPS_InPort,
                                166                 :     TPS_InHostFirstAN,
                                167                 :     TPS_InHost,
                                168                 :     TPS_InEmail,
                                169                 :     TPS_InFileFirst,
                                170                 :     TPS_InFileTwiddle,
                                171                 :     TPS_InPathFirst,
                                172                 :     TPS_InPathFirstFirst,
                                173                 :     TPS_InPathSecond,
                                174                 :     TPS_InFile,
                                175                 :     TPS_InFileNext,
                                176                 :     TPS_InURLPathFirst,
                                177                 :     TPS_InURLPathStart,
                                178                 :     TPS_InURLPath,
                                179                 :     TPS_InFURL,
                                180                 :     TPS_InProtocolFirst,
                                181                 :     TPS_InProtocolSecond,
                                182                 :     TPS_InProtocolEnd,
                                183                 :     TPS_InHyphenAsciiWordFirst,
                                184                 :     TPS_InHyphenAsciiWord,
                                185                 :     TPS_InHyphenWordFirst,
                                186                 :     TPS_InHyphenWord,
                                187                 :     TPS_InHyphenNumWordFirst,
                                188                 :     TPS_InHyphenNumWord,
                                189                 :     TPS_InHyphenDigitLookahead,
                                190                 :     TPS_InParseHyphen,
                                191                 :     TPS_InParseHyphenHyphen,
                                192                 :     TPS_InHyphenWordPart,
                                193                 :     TPS_InHyphenAsciiWordPart,
                                194                 :     TPS_InHyphenNumWordPart,
                                195                 :     TPS_InHyphenUnsignedInt,
                                196                 :     TPS_Null                    /* last state (fake value) */
                                197                 : } TParserState;
                                198                 : 
                                199                 : /* forward declaration */
                                200                 : struct TParser;
                                201                 : 
                                202                 : typedef int (*TParserCharTest) (struct TParser *);  /* any p_is* functions
                                203                 :                                                      * except p_iseq */
                                204                 : typedef void (*TParserSpecial) (struct TParser *);  /* special handler for
                                205                 :                                                      * special cases... */
                                206                 : 
                                207                 : typedef struct
                                208                 : {
                                209                 :     TParserCharTest isclass;
                                210                 :     char        c;
                                211                 :     uint16      flags;
                                212                 :     TParserState tostate;
                                213                 :     int         type;
                                214                 :     TParserSpecial special;
                                215                 : } TParserStateActionItem;
                                216                 : 
                                217                 : /* Flag bits in TParserStateActionItem.flags */
                                218                 : #define A_NEXT      0x0000
                                219                 : #define A_BINGO     0x0001
                                220                 : #define A_POP       0x0002
                                221                 : #define A_PUSH      0x0004
                                222                 : #define A_RERUN     0x0008
                                223                 : #define A_CLEAR     0x0010
                                224                 : #define A_MERGE     0x0020
                                225                 : #define A_CLRALL    0x0040
                                226                 : 
                                227                 : typedef struct TParserPosition
                                228                 : {
                                229                 :     int         posbyte;        /* position of parser in bytes */
                                230                 :     int         poschar;        /* position of parser in characters */
                                231                 :     int         charlen;        /* length of current char */
                                232                 :     int         lenbytetoken;   /* length of token-so-far in bytes */
                                233                 :     int         lenchartoken;   /* and in chars */
                                234                 :     TParserState state;
                                235                 :     struct TParserPosition *prev;
                                236                 :     const TParserStateActionItem *pushedAtAction;
                                237                 : } TParserPosition;
                                238                 : 
                                239                 : typedef struct TParser
                                240                 : {
                                241                 :     /* string and position information */
                                242                 :     char       *str;            /* multibyte string */
                                243                 :     int         lenstr;         /* length of mbstring */
                                244                 :     wchar_t    *wstr;           /* wide character string */
                                245                 :     pg_wchar   *pgwstr;         /* wide character string for C-locale */
                                246                 :     bool        usewide;
                                247                 : 
                                248                 :     /* State of parse */
                                249                 :     int         charmaxlen;
                                250                 :     TParserPosition *state;
                                251                 :     bool        ignore;
                                252                 :     bool        wanthost;
                                253                 : 
                                254                 :     /* silly char */
                                255                 :     char        c;
                                256                 : 
                                257                 :     /* out */
                                258                 :     char       *token;
                                259                 :     int         lenbytetoken;
                                260                 :     int         lenchartoken;
                                261                 :     int         type;
                                262                 : } TParser;
                                263                 : 
                                264                 : 
                                265                 : /* forward decls here */
                                266                 : static bool TParserGet(TParser *prs);
                                267                 : 
                                268                 : 
                                269                 : static TParserPosition *
 5624 bruce                     270 CBC        5104 : newTParserPosition(TParserPosition *prev)
                                271                 : {
 5710 tgl                       272            5104 :     TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
                                273                 : 
                                274            5104 :     if (prev)
                                275            2619 :         memcpy(res, prev, sizeof(TParserPosition));
                                276                 :     else
                                277            2485 :         memset(res, 0, sizeof(TParserPosition));
                                278                 : 
                                279            5104 :     res->prev = prev;
                                280                 : 
                                281            5104 :     res->pushedAtAction = NULL;
                                282                 : 
                                283            5104 :     return res;
                                284                 : }
                                285                 : 
                                286                 : static TParser *
                                287            2365 : TParserInit(char *str, int len)
                                288                 : {
                                289            2365 :     TParser    *prs = (TParser *) palloc0(sizeof(TParser));
                                290                 : 
                                291            2365 :     prs->charmaxlen = pg_database_encoding_max_length();
                                292            2365 :     prs->str = str;
                                293            2365 :     prs->lenstr = len;
                                294                 : 
                                295                 :     /*
                                296                 :      * Use wide char code only when max encoding length > 1.
                                297                 :      */
                                298            2365 :     if (prs->charmaxlen > 1)
                                299                 :     {
 2118                           300            2365 :         pg_locale_t mylocale = 0;   /* TODO */
                                301                 : 
 5710                           302            2365 :         prs->usewide = true;
   23 jdavis                    303            2365 :         if (database_ctype_is_c)
                                304                 :         {
                                305                 :             /*
                                306                 :              * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
                                307                 :              * be different from sizeof(wchar_t)
                                308                 :              */
 4382 bruce                     309             787 :             prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
 5151 teodor                    310             787 :             pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
                                311                 :         }
                                312                 :         else
                                313                 :         {
                                314            1578 :             prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
 4369 tgl                       315            1578 :             char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
                                316                 :                        mylocale);
                                317                 :         }
                                318                 :     }
                                319                 :     else
 5710 tgl                       320 UBC           0 :         prs->usewide = false;
                                321                 : 
 5710 tgl                       322 CBC        2365 :     prs->state = newTParserPosition(NULL);
                                323            2365 :     prs->state->state = TPS_Base;
                                324                 : 
                                325                 : #ifdef WPARSER_TRACE
                                326                 :     fprintf(stderr, "parsing \"%.*s\"\n", len, str);
                                327                 : #endif
                                328                 : 
                                329            2365 :     return prs;
                                330                 : }
                                331                 : 
                                332                 : /*
                                333                 :  * As an alternative to a full TParserInit one can create a
                                334                 :  * TParserCopy which basically is a regular TParser without a private
                                335                 :  * copy of the string - instead it uses the one from another TParser.
                                336                 :  * This is useful because at some places TParsers are created
                                337                 :  * recursively and the repeated copying around of the strings can
                                338                 :  * cause major inefficiency if the source string is long.
                                339                 :  * The new parser starts parsing at the original's current position.
                                340                 :  *
                                341                 :  * Obviously one must not close the original TParser before the copy.
                                342                 :  */
                                343                 : static TParser *
 4863                           344             120 : TParserCopyInit(const TParser *orig)
                                345                 : {
                                346             120 :     TParser    *prs = (TParser *) palloc0(sizeof(TParser));
                                347                 : 
                                348             120 :     prs->charmaxlen = orig->charmaxlen;
                                349             120 :     prs->str = orig->str + orig->state->posbyte;
                                350             120 :     prs->lenstr = orig->lenstr - orig->state->posbyte;
                                351             120 :     prs->usewide = orig->usewide;
                                352                 : 
                                353             120 :     if (orig->pgwstr)
                                354              40 :         prs->pgwstr = orig->pgwstr + orig->state->poschar;
                                355             120 :     if (orig->wstr)
                                356              80 :         prs->wstr = orig->wstr + orig->state->poschar;
                                357                 : 
                                358             120 :     prs->state = newTParserPosition(NULL);
                                359             120 :     prs->state->state = TPS_Base;
                                360                 : 
                                361                 : #ifdef WPARSER_TRACE
                                362                 :     fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
                                363                 : #endif
                                364                 : 
                                365             120 :     return prs;
                                366                 : }
                                367                 : 
                                368                 : 
                                369                 : static void
 5624 bruce                     370            2365 : TParserClose(TParser *prs)
                                371                 : {
 5710 tgl                       372            4730 :     while (prs->state)
                                373                 :     {
                                374            2365 :         TParserPosition *ptr = prs->state->prev;
                                375                 : 
                                376            2365 :         pfree(prs->state);
                                377            2365 :         prs->state = ptr;
                                378                 :     }
                                379                 : 
                                380            2365 :     if (prs->wstr)
                                381            1578 :         pfree(prs->wstr);
 5151 teodor                    382            2365 :     if (prs->pgwstr)
                                383             787 :         pfree(prs->pgwstr);
                                384                 : 
                                385                 : #ifdef WPARSER_TRACE
                                386                 :     fprintf(stderr, "closing parser\n");
                                387                 : #endif
 5710 tgl                       388            2365 :     pfree(prs);
                                389            2365 : }
                                390                 : 
                                 391                 : /*
                                 392                 :  * Close a parser created with TParserCopyInit
                                                        *
                                                        * Pops and frees every TParserPosition on the prs->state stack (following
                                                        * the prev links) and then frees prs itself, so prs must not be used
                                                        * after this call.  Nothing else reachable from prs is freed here;
                                                        * presumably the string buffers belong to the parser the copy was made
                                                        * from (verify against TParserCopyInit).
                                 393                 :  */
                                 394                 : static void
  4863                           395             120 : TParserCopyClose(TParser *prs)
                                 396                 : {
                                 397             306 :     while (prs->state)
                                 398                 :     {
                                 399             186 :         TParserPosition *ptr = prs->state->prev;
                                 400                 : 
                                 401             186 :         pfree(prs->state);
                                 402             186 :         prs->state = ptr;
                                 403                 :     }
                                 404                 : 
                                 405                 : #ifdef WPARSER_TRACE
                                 406                 :     fprintf(stderr, "closing parser copy\n");
                                 407                 : #endif
                                 408             120 :     pfree(prs);
                                 409             120 : }
                                410                 : 
                                411                 : 
                                 412                 : /*
                                 413                 :  * Character-type support functions, equivalent to the is* macros, but
                                 414                 :  * working with any possible encoding and locale. Notes:
                                 415                 :  *  - with a multibyte encoding and the C locale, the isw* functions may
                                 416                 :  *    fail or give a wrong result.
                                 417                 :  *  - a multibyte encoding with the C locale is often used for
                                 418                 :  *    Asian languages.
                                 419                 :  *  - if the locale is C then we use pgwstr instead of wstr.
                                                        *
                                                        * p_iswhat(type, nonascii) expands to two functions, p_is<type>() and
                                                        * p_isnot<type>(), that test the character at the current parser position:
                                                        *  - wide mode (prs->usewide) with pgwstr set: a code above 0x7f yields
                                                        *    "nonascii"; any other code is passed to is<type>().
                                                        *  - wide mode without pgwstr: isw<type>() is applied to the wstr element
                                                        *    at state->poschar.
                                                        *  - otherwise: is<type>() is applied to the byte at state->posbyte in
                                                        *    prs->str, read as unsigned char.
                                                        * p_isnot<type>() is the logical negation of p_is<type>().
                                 420                 :  */
                                 421                 : 
                                 422                 : #define p_iswhat(type, nonascii)                                            \
                                 423                 :                                                                             \
                                 424                 : static int                                                                  \
                                 425                 : p_is##type(TParser *prs)                                                    \
                                 426                 : {                                                                           \
                                 427                 :     Assert(prs->state);                                                      \
                                 428                 :     if (prs->usewide)                                                        \
                                 429                 :     {                                                                       \
                                 430                 :         if (prs->pgwstr)                                                 \
                                 431                 :         {                                                                   \
                                 432                 :             unsigned int c = *(prs->pgwstr + prs->state->poschar);         \
                                 433                 :             if (c > 0x7f)                                                    \
                                 434                 :                 return nonascii;                                            \
                                 435                 :             return is##type(c);                                             \
                                 436                 :         }                                                                   \
                                 437                 :         return isw##type(*(prs->wstr + prs->state->poschar));              \
                                 438                 :     }                                                                       \
                                 439                 :     return is##type(*(unsigned char *) (prs->str + prs->state->posbyte));  \
                                 440                 : }                                                                           \
                                 441                 :                                                                             \
                                 442                 : static int                                                                  \
                                 443                 : p_isnot##type(TParser *prs)                                                 \
                                 444                 : {                                                                           \
                                 445                 :     return !p_is##type(prs);                                                \
                                 446                 : }
                                447                 : 
                                 448                 : /*
                                 449                 :  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
                                 450                 :  * an alpha character, but not a member of other char classes.
                                                        *
                                                        * Accordingly the nonascii argument is 1 for alnum and alpha and 0 for
                                                        * every other class instantiated below.  Each line defines a pair of
                                                        * functions, e.g. p_isdigit() and p_isnotdigit().
                                 451                 :  */
  2025                           452           12561 : p_iswhat(alnum, 1)
                                 453           46886 : p_iswhat(alpha, 1)
                                 454           18566 : p_iswhat(digit, 0)
  2025 tgl                       455 UBC           0 : p_iswhat(lower, 0)
                                 456               0 : p_iswhat(print, 0)
                                 457               0 : p_iswhat(punct, 0)
  2025 tgl                       458 CBC         339 : p_iswhat(space, 0)
  2025 tgl                       459 UBC           0 : p_iswhat(upper, 0)
  2025 tgl                       460 CBC           9 : p_iswhat(xdigit, 0)
                                461                 : 
                                 462                 : /* p_iseq should be used only for ASCII symbols */
                                 463                 : 
                                                       /*
                                                        * Returns 1 if the current character is exactly one byte long
                                                        * (state->charlen == 1) and that byte equals c; otherwise returns 0.
                                                        */
                                 464                 : static int
  5624 bruce                     465          115684 : p_iseq(TParser *prs, char c)
                                 466                 : {
  5710 tgl                       467          115684 :     Assert(prs->state);
                                 468          115684 :     return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
                                 469                 : }
                                 470                 : 
                                                       /*
                                                        * Returns 1 when the byte position has reached prs->lenstr or the current
                                                        * character length (state->charlen) is 0; otherwise returns 0.
                                                        */
                                 471                 : static int
  5624 bruce                     472           50025 : p_isEOF(TParser *prs)
                                 473                 : {
  5710 tgl                       474           50025 :     Assert(prs->state);
                                 475           50025 :     return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
                                 476                 : }
                                 477                 : 
                                                       /*
                                                        * Returns 1 if the current character matches the character stored in
                                                        * prs->c, as judged by p_iseq(); otherwise returns 0.
                                                        */
                                 478                 : static int
  5624 bruce                     479          115684 : p_iseqC(TParser *prs)
                                 480                 : {
  5710 tgl                       481          115684 :     return p_iseq(prs, prs->c);
                                 482                 : }
                                 483                 : 
                                                       /*
                                                        * Logical negation of p_iseq(prs, prs->c): returns 1 when the current
                                                        * character does not match prs->c, and 0 when it does.
                                                        */
                                 484                 : static int
  5624 bruce                     485 UBC           0 : p_isneC(TParser *prs)
                                 486                 : {
  5710 tgl                       487               0 :     return !p_iseq(prs, prs->c);
                                 488                 : }
                                 489                 : 
                                                       /*
                                                        * Returns 1 if the current character is one byte long and isascii()
                                                        * accepts that byte; otherwise returns 0.  The byte is cast to unsigned
                                                        * char before it is handed to isascii().
                                                        */
                                 490                 : static int
  5624 bruce                     491 CBC       36730 : p_isascii(TParser *prs)
                                 492                 : {
  5710 tgl                       493           36730 :     return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
                                 494                 : }
                                 495                 : 
                                                       /*
                                                        * Returns 1 if the current character satisfies both p_isascii() and
                                                        * p_isalpha(); otherwise returns 0.
                                                        */
                                 496                 : static int
  5624 bruce                     497           36730 : p_isasclet(TParser *prs)
                                 498                 : {
  5647 tgl                       499           36730 :     return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
                                 500                 : }
                                 501                 : 
                                                       /*
                                                        * Returns 1 if the current character is accepted as a URL character,
                                                        * otherwise 0.
                                                        *
                                                        * A character is accepted only if it is a single byte in the range
                                                        * 0x21..0x7E and is not one of the characters listed in the switch below.
                                                        * Bytes with the high bit set are rejected by the range test whether
                                                        * plain char is signed (they compare <= 0x20) or unsigned (>= 0x7F).
                                                        */
                                 502                 : static int
  4729                           503            1329 : p_isurlchar(TParser *prs)
                                 504                 : {
                                 505                 :     char        ch;
                                 506                 : 
                                 507                 :     /* no non-ASCII need apply */
                                 508            1329 :     if (prs->state->charlen != 1)
  4729 tgl                       509 UBC           0 :         return 0;
  4729 tgl                       510 CBC        1329 :     ch = *(prs->str + prs->state->posbyte);
                                 511                 :     /* no spaces or control characters */
                                 512            1329 :     if (ch <= 0x20 || ch >= 0x7F)
                                 513             117 :         return 0;
                                 514                 :     /* reject characters disallowed by RFC 3986 */
                                 515            1212 :     switch (ch)
                                 516                 :     {
                                 517              12 :         case '"':
                                 518                 :         case '<':
                                 519                 :         case '>':
                                 520                 :         case '\\':
                                 521                 :         case '^':
                                 522                 :         case '`':
                                 523                 :         case '{':
                                 524                 :         case '|':
                                 525                 :         case '}':
                                 526              12 :             return 0;
                                 527                 :     }
                                 528            1200 :     return 1;
                                 529                 : }
                                530                 : 
                                531                 : 
                                 532                 : /* deliberately suppress unused-function complaints for the above */
                                                       /*
                                                        * This function only exists to reference each p_is* helper once.  Every
                                                        * call passes NULL and each helper reads through its prs argument, so the
                                                        * function must never actually be executed.  It is declared without
                                                        * "static", presumably so that it is not itself reported as unused.
                                                        */
                                 533                 : void        _make_compiler_happy(void);
                                 534                 : void
  5710 tgl                       535 UBC           0 : _make_compiler_happy(void)
                                 536                 : {
                                 537               0 :     p_isalnum(NULL);
                                 538               0 :     p_isnotalnum(NULL);
                                 539               0 :     p_isalpha(NULL);
                                 540               0 :     p_isnotalpha(NULL);
                                 541               0 :     p_isdigit(NULL);
                                 542               0 :     p_isnotdigit(NULL);
                                 543               0 :     p_islower(NULL);
                                 544               0 :     p_isnotlower(NULL);
                                 545               0 :     p_isprint(NULL);
                                 546               0 :     p_isnotprint(NULL);
                                 547               0 :     p_ispunct(NULL);
                                 548               0 :     p_isnotpunct(NULL);
                                 549               0 :     p_isspace(NULL);
                                 550               0 :     p_isnotspace(NULL);
                                 551               0 :     p_isupper(NULL);
                                 552               0 :     p_isnotupper(NULL);
                                 553               0 :     p_isxdigit(NULL);
                                 554               0 :     p_isnotxdigit(NULL);
                                 555               0 :     p_isEOF(NULL);
                                 556               0 :     p_iseqC(NULL);
                                 557               0 :     p_isneC(NULL);
                                 558               0 : }
                                559                 : 
                                 560                 : 
                                                       /*
                                                        * Updates prs->ignore according to the current token.
                                                        *
                                                        * The token is selected by its length in characters
                                                        * (state->lenchartoken) and then compared with pg_strncasecmp():
                                                        * "<script" and "<style" set the ignore flag, "</script" and "</style"
                                                        * clear it.  Any other token leaves the flag unchanged.
                                                        */
                                 561                 : static void
  5624 bruce                     562 CBC         126 : SpecialTags(TParser *prs)
                                 563                 : {
  5643 tgl                       564             126 :     switch (prs->state->lenchartoken)
                                 565                 :     {
  2118                           566               3 :         case 8:                 /* </script */
  5643                           567               3 :             if (pg_strncasecmp(prs->token, "</script", 8) == 0)
  5710                           568               3 :                 prs->ignore = false;
                                 569               3 :             break;
  2118                           570              12 :         case 7:                 /* <script || </style */
  5643                           571              12 :             if (pg_strncasecmp(prs->token, "</style", 7) == 0)
  5710 tgl                       572 UBC           0 :                 prs->ignore = false;
  5643 tgl                       573 CBC          12 :             else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
  5710                           574               3 :                 prs->ignore = true;
                                 575              12 :             break;
  2118                           576               9 :         case 6:                 /* <style */
  5643                           577               9 :             if (pg_strncasecmp(prs->token, "<style", 6) == 0)
  5710 tgl                       578 UBC           0 :                 prs->ignore = true;
  5710 tgl                       579 CBC           9 :             break;
                                 580             102 :         default:
                                 581             102 :             break;
                                 582                 :     }
                                 583             126 : }
                                 584                 : 
                                                       /*
                                                        * Sets prs->wanthost and moves the current position back by the byte and
                                                        * character lengths of the current token.  The token length counters are
                                                        * left as they are.  Presumably this makes the same text get scanned again
                                                        * with a host expected (confirm against the parser state table).
                                                        */
                                 585                 : static void
  5624 bruce                     586              66 : SpecialFURL(TParser *prs)
                                 587                 : {
  5710 tgl                       588              66 :     prs->wanthost = true;
  5643                           589              66 :     prs->state->posbyte -= prs->state->lenbytetoken;
                                 590              66 :     prs->state->poschar -= prs->state->lenchartoken;
  5710                           591              66 : }
                                 592                 : 
                                                       /*
                                                        * Moves the current position back by the byte and character lengths of
                                                        * the current token.  The token length counters are left unchanged.
                                                        */
                                 593                 : static void
  5624 bruce                     594              18 : SpecialHyphen(TParser *prs)
                                 595                 : {
  5643 tgl                       596              18 :     prs->state->posbyte -= prs->state->lenbytetoken;
                                 597              18 :     prs->state->poschar -= prs->state->lenchartoken;
  5710                           598              18 : }
                                 599                 : 
                                                       /*
                                                        * Moves the current position back by the byte and character lengths of
                                                        * the current token and then resets both token length counters to zero.
                                                        */
                                 600                 : static void
  5624 bruce                     601 UBC           0 : SpecialVerVersion(TParser *prs)
                                 602                 : {
  5643 tgl                       603               0 :     prs->state->posbyte -= prs->state->lenbytetoken;
                                 604               0 :     prs->state->poschar -= prs->state->lenchartoken;
                                 605               0 :     prs->state->lenbytetoken = 0;
                                 606               0 :     prs->state->lenchartoken = 0;
  5710                           607               0 : }
                                 608                 : 
                                                       /*
                                                        * Returns 1 if prs->wanthost is set, and 0 otherwise.
                                                        *
                                                        * Note that this predicate has a side effect: when it returns 1 it also
                                                        * clears wanthost, so a second call returns 0 unless the flag has been
                                                        * set again in between.
                                                        */
                                 609                 : static int
  5624 bruce                     610 CBC         240 : p_isstophost(TParser *prs)
                                 611                 : {
  5710 tgl                       612             240 :     if (prs->wanthost)
                                 613                 :     {
                                 614             102 :         prs->wanthost = false;
                                 615             102 :         return 1;
                                 616                 :     }
                                 617             138 :     return 0;
                                 618                 : }
                                 619                 : 
                                                       /*
                                                        * Returns 1 if prs->ignore is set, and 0 otherwise.  The flag is the one
                                                        * maintained by SpecialTags().
                                                        */
                                 620                 : static int
  5624 bruce                     621           18031 : p_isignore(TParser *prs)
                                 622                 : {
  5710 tgl                       623           18031 :     return (prs->ignore) ? 1 : 0;
                                 624                 : }
                                 625                 : 
                                                       /*
                                                        * Tests whether a HOST token can be read starting at the current position.
                                                        *
                                                        * Runs TParserGet() on a temporary copy of the parser (made with
                                                        * TParserCopyInit) that has wanthost set.  If the copy yields a token of
                                                        * type HOST, the position and token length counters in prs->state are
                                                        * advanced by that token's byte and character lengths, charlen is taken
                                                        * from the copy's state, and 1 is returned.  Otherwise 0 is returned and
                                                        * none of those fields are updated here.  The copy is released with
                                                        * TParserCopyClose() in both cases.
                                                        */
                                 626                 : static int
  5624 bruce                     627              45 : p_ishost(TParser *prs)
                                 628                 : {
  4382                           629              45 :     TParser    *tmpprs = TParserCopyInit(prs);
  5710 tgl                       630              45 :     int         res = 0;
                                 631                 : 
  5143 teodor                    632              45 :     tmpprs->wanthost = true;
                                 633                 : 
  5710 tgl                       634              45 :     if (TParserGet(tmpprs) && tmpprs->type == HOST)
                                 635                 :     {
  5643                           636              36 :         prs->state->posbyte += tmpprs->lenbytetoken;
                                 637              36 :         prs->state->poschar += tmpprs->lenchartoken;
                                 638              36 :         prs->state->lenbytetoken += tmpprs->lenbytetoken;
                                 639              36 :         prs->state->lenchartoken += tmpprs->lenchartoken;
  5710                           640              36 :         prs->state->charlen = tmpprs->state->charlen;
                                 641              36 :         res = 1;
                                 642                 :     }
  4863                           643              45 :     TParserCopyClose(tmpprs);
                                 644                 : 
  5710                           645              45 :     return res;
                                 646                 : }
                                 647                 : 
                                                       /*
                                                        * Tests whether a URLPATH token can be read starting at the current
                                                        * position.
                                                        *
                                                        * Works on a temporary copy of the parser (made with TParserCopyInit).
                                                        * A new position is pushed onto the copy with newTParserPosition() and
                                                        * its state is set to TPS_InURLPathFirst before TParserGet() is run.  If
                                                        * the copy yields a token of type URLPATH, the position and token length
                                                        * counters in prs->state are advanced by that token's byte and character
                                                        * lengths, charlen is taken from the copy's state, and 1 is returned.
                                                        * Otherwise 0 is returned and none of those fields are updated here.  The
                                                        * copy is released with TParserCopyClose() in both cases.
                                                        */
                                 648                 : static int
  5624 bruce                     649              75 : p_isURLPath(TParser *prs)
                                 650                 : {
  4382                           651              75 :     TParser    *tmpprs = TParserCopyInit(prs);
  5710 tgl                       652              75 :     int         res = 0;
                                 653                 : 
                                 654              75 :     tmpprs->state = newTParserPosition(tmpprs->state);
  4729                           655              75 :     tmpprs->state->state = TPS_InURLPathFirst;
                                 656                 : 
                                 657              75 :     if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
                                 658                 :     {
  5643                           659              66 :         prs->state->posbyte += tmpprs->lenbytetoken;
                                 660              66 :         prs->state->poschar += tmpprs->lenchartoken;
                                 661              66 :         prs->state->lenbytetoken += tmpprs->lenbytetoken;
                                 662              66 :         prs->state->lenchartoken += tmpprs->lenchartoken;
  5710                           663              66 :         prs->state->charlen = tmpprs->state->charlen;
                                 664              66 :         res = 1;
                                 665                 :     }
  4863                           666              75 :     TParserCopyClose(tmpprs);
                                 667                 : 
  5710                           668              75 :     return res;
                                 669                 : }
                                670                 : 
                                 671                 : /*
                                 672                 :  * Returns true if the current character has zero display length or
                                 673                 :  * is a special sign used in several languages. Such characters
                                 674                 :  * are not word breakers even though they are not alphabetic.
                                 675                 :  * At the beginning of a word they are not part of it.
                                 676                 :  */
                                677                 : static int
 5142 teodor                    678            4362 : p_isspecial(TParser *prs)
                                679                 : {
                                680                 :     /*
                                681                 :      * pg_dsplen could return -1 which means error or control character
                                682                 :      */
 4382 bruce                     683            4362 :     if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
 5142 teodor                    684 UBC           0 :         return 1;
                                685                 : 
                                686                 :     /*
                                687                 :      * Unicode Characters in the 'Mark, Spacing Combining' Category That
                                688                 :      * characters are not alpha although they are not breakers of word too.
                                689                 :      * Check that only in utf encoding, because other encodings aren't
                                690                 :      * supported by postgres or even exists.
                                691                 :      */
 4382 bruce                     692 CBC        4362 :     if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
                                693                 :     {
                                694                 :         static const pg_wchar strange_letter[] = {
                                695                 :             /*
                                696                 :              * use binary search, so elements should be ordered
                                697                 :              */
                                698                 :             0x0903,             /* DEVANAGARI SIGN VISARGA */
                                699                 :             0x093E,             /* DEVANAGARI VOWEL SIGN AA */
                                700                 :             0x093F,             /* DEVANAGARI VOWEL SIGN I */
                                701                 :             0x0940,             /* DEVANAGARI VOWEL SIGN II */
                                702                 :             0x0949,             /* DEVANAGARI VOWEL SIGN CANDRA O */
                                703                 :             0x094A,             /* DEVANAGARI VOWEL SIGN SHORT O */
                                704                 :             0x094B,             /* DEVANAGARI VOWEL SIGN O */
                                705                 :             0x094C,             /* DEVANAGARI VOWEL SIGN AU */
                                706                 :             0x0982,             /* BENGALI SIGN ANUSVARA */
                                707                 :             0x0983,             /* BENGALI SIGN VISARGA */
                                708                 :             0x09BE,             /* BENGALI VOWEL SIGN AA */
                                709                 :             0x09BF,             /* BENGALI VOWEL SIGN I */
                                710                 :             0x09C0,             /* BENGALI VOWEL SIGN II */
                                711                 :             0x09C7,             /* BENGALI VOWEL SIGN E */
                                712                 :             0x09C8,             /* BENGALI VOWEL SIGN AI */
                                713                 :             0x09CB,             /* BENGALI VOWEL SIGN O */
                                714                 :             0x09CC,             /* BENGALI VOWEL SIGN AU */
                                715                 :             0x09D7,             /* BENGALI AU LENGTH MARK */
                                716                 :             0x0A03,             /* GURMUKHI SIGN VISARGA */
                                717                 :             0x0A3E,             /* GURMUKHI VOWEL SIGN AA */
                                718                 :             0x0A3F,             /* GURMUKHI VOWEL SIGN I */
                                719                 :             0x0A40,             /* GURMUKHI VOWEL SIGN II */
                                720                 :             0x0A83,             /* GUJARATI SIGN VISARGA */
                                721                 :             0x0ABE,             /* GUJARATI VOWEL SIGN AA */
                                722                 :             0x0ABF,             /* GUJARATI VOWEL SIGN I */
                                723                 :             0x0AC0,             /* GUJARATI VOWEL SIGN II */
                                724                 :             0x0AC9,             /* GUJARATI VOWEL SIGN CANDRA O */
                                725                 :             0x0ACB,             /* GUJARATI VOWEL SIGN O */
                                726                 :             0x0ACC,             /* GUJARATI VOWEL SIGN AU */
                                727                 :             0x0B02,             /* ORIYA SIGN ANUSVARA */
                                728                 :             0x0B03,             /* ORIYA SIGN VISARGA */
                                729                 :             0x0B3E,             /* ORIYA VOWEL SIGN AA */
                                730                 :             0x0B40,             /* ORIYA VOWEL SIGN II */
                                731                 :             0x0B47,             /* ORIYA VOWEL SIGN E */
                                732                 :             0x0B48,             /* ORIYA VOWEL SIGN AI */
                                733                 :             0x0B4B,             /* ORIYA VOWEL SIGN O */
                                734                 :             0x0B4C,             /* ORIYA VOWEL SIGN AU */
                                735                 :             0x0B57,             /* ORIYA AU LENGTH MARK */
                                736                 :             0x0BBE,             /* TAMIL VOWEL SIGN AA */
                                737                 :             0x0BBF,             /* TAMIL VOWEL SIGN I */
                                738                 :             0x0BC1,             /* TAMIL VOWEL SIGN U */
                                739                 :             0x0BC2,             /* TAMIL VOWEL SIGN UU */
                                740                 :             0x0BC6,             /* TAMIL VOWEL SIGN E */
                                741                 :             0x0BC7,             /* TAMIL VOWEL SIGN EE */
                                742                 :             0x0BC8,             /* TAMIL VOWEL SIGN AI */
                                743                 :             0x0BCA,             /* TAMIL VOWEL SIGN O */
                                744                 :             0x0BCB,             /* TAMIL VOWEL SIGN OO */
                                745                 :             0x0BCC,             /* TAMIL VOWEL SIGN AU */
                                746                 :             0x0BD7,             /* TAMIL AU LENGTH MARK */
                                747                 :             0x0C01,             /* TELUGU SIGN CANDRABINDU */
                                748                 :             0x0C02,             /* TELUGU SIGN ANUSVARA */
                                749                 :             0x0C03,             /* TELUGU SIGN VISARGA */
                                750                 :             0x0C41,             /* TELUGU VOWEL SIGN U */
                                751                 :             0x0C42,             /* TELUGU VOWEL SIGN UU */
                                752                 :             0x0C43,             /* TELUGU VOWEL SIGN VOCALIC R */
                                753                 :             0x0C44,             /* TELUGU VOWEL SIGN VOCALIC RR */
                                754                 :             0x0C82,             /* KANNADA SIGN ANUSVARA */
                                755                 :             0x0C83,             /* KANNADA SIGN VISARGA */
                                756                 :             0x0CBE,             /* KANNADA VOWEL SIGN AA */
                                757                 :             0x0CC0,             /* KANNADA VOWEL SIGN II */
                                758                 :             0x0CC1,             /* KANNADA VOWEL SIGN U */
                                759                 :             0x0CC2,             /* KANNADA VOWEL SIGN UU */
                                760                 :             0x0CC3,             /* KANNADA VOWEL SIGN VOCALIC R */
                                761                 :             0x0CC4,             /* KANNADA VOWEL SIGN VOCALIC RR */
                                762                 :             0x0CC7,             /* KANNADA VOWEL SIGN EE */
                                763                 :             0x0CC8,             /* KANNADA VOWEL SIGN AI */
                                764                 :             0x0CCA,             /* KANNADA VOWEL SIGN O */
                                765                 :             0x0CCB,             /* KANNADA VOWEL SIGN OO */
                                766                 :             0x0CD5,             /* KANNADA LENGTH MARK */
                                767                 :             0x0CD6,             /* KANNADA AI LENGTH MARK */
                                768                 :             0x0D02,             /* MALAYALAM SIGN ANUSVARA */
                                769                 :             0x0D03,             /* MALAYALAM SIGN VISARGA */
                                770                 :             0x0D3E,             /* MALAYALAM VOWEL SIGN AA */
                                771                 :             0x0D3F,             /* MALAYALAM VOWEL SIGN I */
                                772                 :             0x0D40,             /* MALAYALAM VOWEL SIGN II */
                                773                 :             0x0D46,             /* MALAYALAM VOWEL SIGN E */
                                774                 :             0x0D47,             /* MALAYALAM VOWEL SIGN EE */
                                775                 :             0x0D48,             /* MALAYALAM VOWEL SIGN AI */
                                776                 :             0x0D4A,             /* MALAYALAM VOWEL SIGN O */
                                777                 :             0x0D4B,             /* MALAYALAM VOWEL SIGN OO */
                                778                 :             0x0D4C,             /* MALAYALAM VOWEL SIGN AU */
                                779                 :             0x0D57,             /* MALAYALAM AU LENGTH MARK */
                                780                 :             0x0D82,             /* SINHALA SIGN ANUSVARAYA */
                                781                 :             0x0D83,             /* SINHALA SIGN VISARGAYA */
                                782                 :             0x0DCF,             /* SINHALA VOWEL SIGN AELA-PILLA */
                                783                 :             0x0DD0,             /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
                                784                 :             0x0DD1,             /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
                                785                 :             0x0DD8,             /* SINHALA VOWEL SIGN GAETTA-PILLA */
                                786                 :             0x0DD9,             /* SINHALA VOWEL SIGN KOMBUVA */
                                787                 :             0x0DDA,             /* SINHALA VOWEL SIGN DIGA KOMBUVA */
                                788                 :             0x0DDB,             /* SINHALA VOWEL SIGN KOMBU DEKA */
                                789                 :             0x0DDC,             /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
                                790                 :             0x0DDD,             /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
                                791                 :                                  * AELA-PILLA */
                                792                 :             0x0DDE,             /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
                                793                 :             0x0DDF,             /* SINHALA VOWEL SIGN GAYANUKITTA */
                                794                 :             0x0DF2,             /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
                                795                 :             0x0DF3,             /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
                                796                 :             0x0F3E,             /* TIBETAN SIGN YAR TSHES */
                                797                 :             0x0F3F,             /* TIBETAN SIGN MAR TSHES */
                                798                 :             0x0F7F,             /* TIBETAN SIGN RNAM BCAD */
                                799                 :             0x102B,             /* MYANMAR VOWEL SIGN TALL AA */
                                800                 :             0x102C,             /* MYANMAR VOWEL SIGN AA */
                                801                 :             0x1031,             /* MYANMAR VOWEL SIGN E */
                                802                 :             0x1038,             /* MYANMAR SIGN VISARGA */
                                803                 :             0x103B,             /* MYANMAR CONSONANT SIGN MEDIAL YA */
                                804                 :             0x103C,             /* MYANMAR CONSONANT SIGN MEDIAL RA */
                                805                 :             0x1056,             /* MYANMAR VOWEL SIGN VOCALIC R */
                                806                 :             0x1057,             /* MYANMAR VOWEL SIGN VOCALIC RR */
                                807                 :             0x1062,             /* MYANMAR VOWEL SIGN SGAW KAREN EU */
                                808                 :             0x1063,             /* MYANMAR TONE MARK SGAW KAREN HATHI */
                                809                 :             0x1064,             /* MYANMAR TONE MARK SGAW KAREN KE PHO */
                                810                 :             0x1067,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
                                811                 :             0x1068,             /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
                                812                 :             0x1069,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
                                813                 :             0x106A,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
                                814                 :             0x106B,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
                                815                 :             0x106C,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
                                816                 :             0x106D,             /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
                                817                 :             0x1083,             /* MYANMAR VOWEL SIGN SHAN AA */
                                818                 :             0x1084,             /* MYANMAR VOWEL SIGN SHAN E */
                                819                 :             0x1087,             /* MYANMAR SIGN SHAN TONE-2 */
                                820                 :             0x1088,             /* MYANMAR SIGN SHAN TONE-3 */
                                821                 :             0x1089,             /* MYANMAR SIGN SHAN TONE-5 */
                                822                 :             0x108A,             /* MYANMAR SIGN SHAN TONE-6 */
                                823                 :             0x108B,             /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
                                824                 :             0x108C,             /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
                                825                 :             0x108F,             /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
                                826                 :             0x17B6,             /* KHMER VOWEL SIGN AA */
                                827                 :             0x17BE,             /* KHMER VOWEL SIGN OE */
                                828                 :             0x17BF,             /* KHMER VOWEL SIGN YA */
                                829                 :             0x17C0,             /* KHMER VOWEL SIGN IE */
                                830                 :             0x17C1,             /* KHMER VOWEL SIGN E */
                                831                 :             0x17C2,             /* KHMER VOWEL SIGN AE */
                                832                 :             0x17C3,             /* KHMER VOWEL SIGN AI */
                                833                 :             0x17C4,             /* KHMER VOWEL SIGN OO */
                                834                 :             0x17C5,             /* KHMER VOWEL SIGN AU */
                                835                 :             0x17C7,             /* KHMER SIGN REAHMUK */
                                836                 :             0x17C8,             /* KHMER SIGN YUUKALEAPINTU */
                                837                 :             0x1923,             /* LIMBU VOWEL SIGN EE */
                                838                 :             0x1924,             /* LIMBU VOWEL SIGN AI */
                                839                 :             0x1925,             /* LIMBU VOWEL SIGN OO */
                                840                 :             0x1926,             /* LIMBU VOWEL SIGN AU */
                                841                 :             0x1929,             /* LIMBU SUBJOINED LETTER YA */
                                842                 :             0x192A,             /* LIMBU SUBJOINED LETTER RA */
                                843                 :             0x192B,             /* LIMBU SUBJOINED LETTER WA */
                                844                 :             0x1930,             /* LIMBU SMALL LETTER KA */
                                845                 :             0x1931,             /* LIMBU SMALL LETTER NGA */
                                846                 :             0x1933,             /* LIMBU SMALL LETTER TA */
                                847                 :             0x1934,             /* LIMBU SMALL LETTER NA */
                                848                 :             0x1935,             /* LIMBU SMALL LETTER PA */
                                849                 :             0x1936,             /* LIMBU SMALL LETTER MA */
                                850                 :             0x1937,             /* LIMBU SMALL LETTER RA */
                                851                 :             0x1938,             /* LIMBU SMALL LETTER LA */
                                852                 :             0x19B0,             /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
                                853                 :             0x19B1,             /* NEW TAI LUE VOWEL SIGN AA */
                                854                 :             0x19B2,             /* NEW TAI LUE VOWEL SIGN II */
                                855                 :             0x19B3,             /* NEW TAI LUE VOWEL SIGN U */
                                856                 :             0x19B4,             /* NEW TAI LUE VOWEL SIGN UU */
                                857                 :             0x19B5,             /* NEW TAI LUE VOWEL SIGN E */
                                858                 :             0x19B6,             /* NEW TAI LUE VOWEL SIGN AE */
                                859                 :             0x19B7,             /* NEW TAI LUE VOWEL SIGN O */
                                860                 :             0x19B8,             /* NEW TAI LUE VOWEL SIGN OA */
                                861                 :             0x19B9,             /* NEW TAI LUE VOWEL SIGN UE */
                                862                 :             0x19BA,             /* NEW TAI LUE VOWEL SIGN AY */
                                863                 :             0x19BB,             /* NEW TAI LUE VOWEL SIGN AAY */
                                864                 :             0x19BC,             /* NEW TAI LUE VOWEL SIGN UY */
                                865                 :             0x19BD,             /* NEW TAI LUE VOWEL SIGN OY */
                                866                 :             0x19BE,             /* NEW TAI LUE VOWEL SIGN OAY */
                                867                 :             0x19BF,             /* NEW TAI LUE VOWEL SIGN UEY */
                                868                 :             0x19C0,             /* NEW TAI LUE VOWEL SIGN IY */
                                869                 :             0x19C8,             /* NEW TAI LUE TONE MARK-1 */
                                870                 :             0x19C9,             /* NEW TAI LUE TONE MARK-2 */
                                871                 :             0x1A19,             /* BUGINESE VOWEL SIGN E */
                                872                 :             0x1A1A,             /* BUGINESE VOWEL SIGN O */
                                873                 :             0x1A1B,             /* BUGINESE VOWEL SIGN AE */
                                874                 :             0x1B04,             /* BALINESE SIGN BISAH */
                                875                 :             0x1B35,             /* BALINESE VOWEL SIGN TEDUNG */
                                876                 :             0x1B3B,             /* BALINESE VOWEL SIGN RA REPA TEDUNG */
                                877                 :             0x1B3D,             /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
                                878                 :             0x1B3E,             /* BALINESE VOWEL SIGN TALING */
                                879                 :             0x1B3F,             /* BALINESE VOWEL SIGN TALING REPA */
                                880                 :             0x1B40,             /* BALINESE VOWEL SIGN TALING TEDUNG */
                                881                 :             0x1B41,             /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
                                882                 :             0x1B43,             /* BALINESE VOWEL SIGN PEPET TEDUNG */
                                883                 :             0x1B44,             /* BALINESE ADEG ADEG */
                                884                 :             0x1B82,             /* SUNDANESE SIGN PANGWISAD */
                                885                 :             0x1BA1,             /* SUNDANESE CONSONANT SIGN PAMINGKAL */
                                886                 :             0x1BA6,             /* SUNDANESE VOWEL SIGN PANAELAENG */
                                887                 :             0x1BA7,             /* SUNDANESE VOWEL SIGN PANOLONG */
                                888                 :             0x1BAA,             /* SUNDANESE SIGN PAMAAEH */
                                889                 :             0x1C24,             /* LEPCHA SUBJOINED LETTER YA */
                                890                 :             0x1C25,             /* LEPCHA SUBJOINED LETTER RA */
                                891                 :             0x1C26,             /* LEPCHA VOWEL SIGN AA */
                                892                 :             0x1C27,             /* LEPCHA VOWEL SIGN I */
                                893                 :             0x1C28,             /* LEPCHA VOWEL SIGN O */
                                894                 :             0x1C29,             /* LEPCHA VOWEL SIGN OO */
                                895                 :             0x1C2A,             /* LEPCHA VOWEL SIGN U */
                                896                 :             0x1C2B,             /* LEPCHA VOWEL SIGN UU */
                                897                 :             0x1C34,             /* LEPCHA CONSONANT SIGN NYIN-DO */
                                898                 :             0x1C35,             /* LEPCHA CONSONANT SIGN KANG */
                                899                 :             0xA823,             /* SYLOTI NAGRI VOWEL SIGN A */
                                900                 :             0xA824,             /* SYLOTI NAGRI VOWEL SIGN I */
                                901                 :             0xA827,             /* SYLOTI NAGRI VOWEL SIGN OO */
                                902                 :             0xA880,             /* SAURASHTRA SIGN ANUSVARA */
                                903                 :             0xA881,             /* SAURASHTRA SIGN VISARGA */
                                904                 :             0xA8B4,             /* SAURASHTRA CONSONANT SIGN HAARU */
                                905                 :             0xA8B5,             /* SAURASHTRA VOWEL SIGN AA */
                                906                 :             0xA8B6,             /* SAURASHTRA VOWEL SIGN I */
                                907                 :             0xA8B7,             /* SAURASHTRA VOWEL SIGN II */
                                908                 :             0xA8B8,             /* SAURASHTRA VOWEL SIGN U */
                                909                 :             0xA8B9,             /* SAURASHTRA VOWEL SIGN UU */
                                910                 :             0xA8BA,             /* SAURASHTRA VOWEL SIGN VOCALIC R */
                                911                 :             0xA8BB,             /* SAURASHTRA VOWEL SIGN VOCALIC RR */
                                912                 :             0xA8BC,             /* SAURASHTRA VOWEL SIGN VOCALIC L */
                                913                 :             0xA8BD,             /* SAURASHTRA VOWEL SIGN VOCALIC LL */
                                914                 :             0xA8BE,             /* SAURASHTRA VOWEL SIGN E */
                                915                 :             0xA8BF,             /* SAURASHTRA VOWEL SIGN EE */
                                916                 :             0xA8C0,             /* SAURASHTRA VOWEL SIGN AI */
                                917                 :             0xA8C1,             /* SAURASHTRA VOWEL SIGN O */
                                918                 :             0xA8C2,             /* SAURASHTRA VOWEL SIGN OO */
                                919                 :             0xA8C3,             /* SAURASHTRA VOWEL SIGN AU */
                                920                 :             0xA952,             /* REJANG CONSONANT SIGN H */
                                921                 :             0xA953,             /* REJANG VIRAMA */
                                922                 :             0xAA2F,             /* CHAM VOWEL SIGN O */
                                923                 :             0xAA30,             /* CHAM VOWEL SIGN AI */
                                924                 :             0xAA33,             /* CHAM CONSONANT SIGN YA */
                                925                 :             0xAA34,             /* CHAM CONSONANT SIGN RA */
                                926                 :             0xAA4D              /* CHAM CONSONANT SIGN FINAL H */
                                927                 :         };
 3368 tgl                       928            4362 :         const pg_wchar *StopLow = strange_letter,
 4382 bruce                     929            4362 :                    *StopHigh = strange_letter + lengthof(strange_letter),
                                930                 :                    *StopMiddle;
                                931                 :         pg_wchar    c;
                                932                 : 
                                933            4362 :         if (prs->pgwstr)
 5142 teodor                    934            1454 :             c = *(prs->pgwstr + prs->state->poschar);
                                935                 :         else
                                936            2908 :             c = (pg_wchar) *(prs->wstr + prs->state->poschar);
                                937                 : 
 4382 bruce                     938           39258 :         while (StopLow < StopHigh)
                                939                 :         {
 5142 teodor                    940           34896 :             StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
 4382 bruce                     941           34896 :             if (*StopMiddle == c)
 5142 teodor                    942 UBC           0 :                 return 1;
 4382 bruce                     943 CBC       34896 :             else if (*StopMiddle < c)
 5142 teodor                    944 UBC           0 :                 StopLow = StopMiddle + 1;
                                945                 :             else
 5142 teodor                    946 CBC       34896 :                 StopHigh = StopMiddle;
                                947                 :         }
                                948                 :     }
                                949                 : 
                                950            4362 :     return 0;
                                951                 : }
                                952                 : 
                                953                 : /*
                                954                 :  * State/action tables of the parser
                                955                 :  */
                                956                 : 
                                957                 : static const TParserStateActionItem actionTPS_Base[] = {  /* rows for state TPS_Base (per the array name): picks the state that handles the character just seen */
                                958                 :     {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
                                959                 :     {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},  /* p_iseqC rows carry the character to compare in the 2nd field */
                                960                 :     {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
                                961                 :     {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},  /* listed ahead of p_isalpha; NOTE(review): presumably rows are tried top to bottom and the first match wins -- confirm in the driver */
                                962                 :     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
                                963                 :     {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
                                964                 :     {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},  /* '-' and '+' share the same target state */
                                965                 :     {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
                                966                 :     {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
                                967                 :     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
                                968                 :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
                                969                 :     {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
                                970                 :     {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}  /* NULL test: default row, targets TPS_InSpace */
                                971                 : };
                                972                 : 
                                973                 : 
                                974                 : static const TParserStateActionItem actionTPS_InNumWord[] = {  /* rows for state TPS_InNumWord (per the array name) */
                                975                 :     {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},  /* only the A_BINGO rows have a non-zero 5th field, here NUMWORD */
                                976                 :     {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},  /* target is this table's own state */
                                977                 :     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
                                978                 :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
                                979                 :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
                                980                 :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
                                981                 :     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
                                982                 :     {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}  /* NULL test: default row, same fields as the p_isEOF row */
                                983                 : };
                                984                 : 
                                985                 : static const TParserStateActionItem actionTPS_InAsciiWord[] = {  /* rows for state TPS_InAsciiWord (per the array name) */
                                986                 :     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
                                987                 :     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},  /* NOTE(review): a TPS_Null target presumably means "remain in the current state" -- confirm */
                                988                 :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},  /* '.' has two A_PUSH rows; NOTE(review): presumably the next row is tried if this path is abandoned -- confirm */
                                989                 :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
                                990                 :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},  /* '-' likewise has two A_PUSH rows */
                                991                 :     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
                                992                 :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
                                993                 :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
                                994                 :     {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
                                995                 :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
                                996                 :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},  /* digit: an A_PUSH row to TPS_InHost, then an A_NEXT row to TPS_InNumWord */
                                997                 :     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
                                998                 :     {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},  /* listed after p_isasclet, so it sits behind the ASCII-letter row */
                                999                 :     {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
                               1000                 :     {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}  /* NULL test: default row, same fields as the p_isEOF row */
                               1001                 : };
                               1002                 : 
                               1003                 : static const TParserStateActionItem actionTPS_InWord[] = {  /* rows for state TPS_InWord (per the array name) */
                               1004                 :     {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},  /* only the A_BINGO rows have a non-zero 5th field, here WORD_T */
                               1005                 :     {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},  /* NOTE(review): a TPS_Null target presumably means "remain in the current state" -- confirm */
                               1006                 :     {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
                               1007                 :     {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},  /* a digit moves to TPS_InNumWord */
                               1008                 :     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
                               1009                 :     {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}  /* NULL test: default row, same fields as the p_isEOF row */
                               1010                 : };
                               1011                 : 
                               1012                 : static const TParserStateActionItem actionTPS_InUnsignedInt[] = {  /* rows for state TPS_InUnsignedInt (per the array name) */
                               1013                 :     {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
                               1014                 :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},  /* NOTE(review): a TPS_Null target presumably means "remain in the current state" -- confirm */
                               1015                 :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},  /* '.' has two A_PUSH rows; NOTE(review): presumably the next row is tried if this path is abandoned -- confirm */
                               1016                 :     {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
                               1017                 :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},  /* 'e' and 'E' share the same target state */
                               1018                 :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
                               1019                 :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
                               1020                 :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
                               1021                 :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
                               1022                 :     {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},  /* listed after the 'e'/'E' rows and before p_isalpha */
                               1023                 :     {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
                               1024                 :     {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
                               1025                 :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
                               1026                 :     {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}  /* NULL test: default row, same fields as the p_isEOF row */
                               1027                 : };
                               1028                 : 
                               1029                 : static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {  /* rows for state TPS_InSignedIntFirst, the target of the '-' and '+' rows in actionTPS_Base */
                               1030                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1031                 :     {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},  /* a digit is the only test here that does not use A_POP */
                               1032                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}  /* NULL test: default row, A_POP like the p_isEOF row */
                               1033                 : };
                               1034                 : 
                               1035                 : static const TParserStateActionItem actionTPS_InSignedInt[] = {  /* rows for state TPS_InSignedInt (per the array name) */
                               1036                 :     {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},  /* only the A_BINGO rows have a non-zero 5th field, here SIGNEDINT */
                               1037                 :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},  /* NOTE(review): a TPS_Null target presumably means "remain in the current state" -- confirm */
                               1038                 :     {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
                               1039                 :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},  /* 'e' and 'E' share the same target state */
                               1040                 :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
                               1041                 :     {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}  /* NULL test: default row, same fields as the p_isEOF row */
                               1042                 : };
                               1043                 : 
                               1044                 : static const TParserStateActionItem actionTPS_InSpace[] = {  /* rows for state TPS_InSpace (per the array name) */
                               1045                 :     {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
                               1046                 :     {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},  /* '<', '-', '+', '&' and '/' all have A_PUSH rows in actionTPS_Base; here they are A_BINGO rows targeting TPS_Base */
                               1047                 :     {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},  /* NOTE(review): a TPS_Null target presumably means "remain in the current state" -- confirm */
                               1048                 :     {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
                               1049                 :     {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
                               1050                 :     {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
                               1051                 :     {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
                               1052                 :     {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},  /* listed after the specific-character rows above */
                               1053                 :     {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}  /* NULL test: default row, same fields as the p_isEOF row */
                               1054                 : };
                               1055                 : 
                               1056                 : static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {  /* rows for state TPS_InUDecimalFirst, the target of the second '.' row in actionTPS_InUnsignedInt */
                               1057                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1058                 :     {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},  /* flags are A_CLEAR alone, whereas actionTPS_InSignedIntFirst writes A_NEXT | A_CLEAR; NOTE(review): equivalent only if A_NEXT is zero -- confirm */
                               1059                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}  /* NULL test: default row, A_POP like the p_isEOF row */
                               1060                 : };
                               1061                 : 
                               1062                 : static const TParserStateActionItem actionTPS_InUDecimal[] = {  /* rows for state TPS_InUDecimal (per the array name) */
                               1063                 :     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},  /* only the A_BINGO rows have a non-zero 5th field, here DECIMAL_T */
                               1064                 :     {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},  /* target is this table's own state */
                               1065                 :     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},  /* a further '.' targets TPS_InVersionFirst */
                               1066                 :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},  /* 'e' and 'E' share the same target state */
                               1067                 :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
                               1068                 :     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}  /* NULL test: default row, same fields as the p_isEOF row */
                               1069                 : };
                               1070                 : 
                               1071                 : static const TParserStateActionItem actionTPS_InDecimalFirst[] = { /* Rule table named for TPS_InDecimalFirst: only a p_isdigit match moves on (A_CLEAR, to TPS_InDecimal). */
                               1072                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1073                 :     {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
                               1074                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL test: catch-all last row; anything not matched above takes A_POP */
                               1075                 : };
                               1076                 : 
                               1077                 : static const TParserStateActionItem actionTPS_InDecimal[] = { /* Rule table named for TPS_InDecimal: digits loop in place; EOF or unmatched input takes A_BINGO with token type DECIMAL_T. */
                               1078                 :     {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
                               1079                 :     {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
                               1080                 :     {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL}, /* a further '.' goes to TPS_InVerVersion, not TPS_InVersionFirst as in actionTPS_InUDecimal */
                               1081                 :     {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL}, /* 'e' / 'E' try the TPS_InMantissa* tables */
                               1082                 :     {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
                               1083                 :     {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
                               1084                 : };
                               1085                 : 
                               1086                 : static const TParserStateActionItem actionTPS_InVerVersion[] = { /* Rule table named for TPS_InVerVersion: a digit takes A_RERUN into TPS_InSVerVersion and attaches the SpecialVerVersion handler. */
                               1087                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1088                 :     {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion}, /* NOTE(review): SpecialVerVersion is defined outside this excerpt; its effect is not documented here */
                               1089                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1090                 : };
                               1091                 : 
                               1092                 : static const TParserStateActionItem actionTPS_InSVerVersion[] = { /* Rule table named for TPS_InSVerVersion: a digit takes A_BINGO | A_CLRALL with token type SPACE and targets TPS_InUnsignedInt. */
                               1093                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1094                 :     {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
                               1095                 :     {NULL, 0, A_NEXT, TPS_Null, 0, NULL} /* catch-all is A_NEXT with TPS_Null (presumably "stay in this state" -- TODO confirm), not A_POP */
                               1096                 : };
                               1097                 : 
                               1098                 : 
                               1099                 : static const TParserStateActionItem actionTPS_InVersionFirst[] = { /* Rule table named for TPS_InVersionFirst: only a p_isdigit match moves on (A_CLEAR, to TPS_InVersion). */
                               1100                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1101                 :     {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
                               1102                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL test: catch-all last row; anything not matched above takes A_POP */
                               1103                 : };
                               1104                 : 
                               1105                 : static const TParserStateActionItem actionTPS_InVersion[] = { /* Rule table named for TPS_InVersion: digits loop in place; EOF or unmatched input takes A_BINGO with token type VERSIONNUMBER. */
                               1106                 :     {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
                               1107                 :     {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
                               1108                 :     {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL}, /* each further '.' goes back through TPS_InVersionFirst, which requires a digit next */
                               1109                 :     {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
                               1110                 : };
                               1111                 : 
                               1112                 : static const TParserStateActionItem actionTPS_InMantissaFirst[] = { /* Rule table named for TPS_InMantissaFirst: accepts a digit, or a '+' / '-' sign that must then be followed by a digit. NOTE(review): this state is entered from 'e'/'E' rows, so "Mantissa" appears to denote the exponent part. */
                               1113                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1114                 :     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
                               1115                 :     {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
                               1116                 :     {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
                               1117                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1118                 : };
                               1119                 : 
                               1120                 : static const TParserStateActionItem actionTPS_InMantissaSign[] = { /* Rule table named for TPS_InMantissaSign: only a p_isdigit match moves on (A_CLEAR, to TPS_InMantissa). */
                               1121                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1122                 :     {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
                               1123                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL test: catch-all last row; anything not matched above takes A_POP */
                               1124                 : };
                               1125                 : 
                               1126                 : static const TParserStateActionItem actionTPS_InMantissa[] = { /* Rule table named for TPS_InMantissa: digits loop in place; EOF or unmatched input takes A_BINGO with token type SCIENTIFIC. */
                               1127                 :     {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
                               1128                 :     {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
                               1129                 :     {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
                               1130                 : };
                               1131                 : 
                               1132                 : static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = { /* Rule table named for TPS_InXMLEntityFirst: '#' selects the numeric-entity tables; p_isasclet, ':' or '_' selects TPS_InXMLEntity. */
                               1133                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1134                 :     {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
                               1135                 :     {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
                               1136                 :     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
                               1137                 :     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
                               1138                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1139                 : };
                               1140                 : 
                               1141                 : static const TParserStateActionItem actionTPS_InXMLEntity[] = { /* Rule table named for TPS_InXMLEntity: p_isalnum, ':', '_', '.' and '-' loop in place; ';' moves to TPS_InXMLEntityEnd. */
                               1142                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1143                 :     {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
                               1144                 :     {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
                               1145                 :     {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
                               1146                 :     {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
                               1147                 :     {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
                               1148                 :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, /* terminating ';' */
                               1149                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1150                 : };
                               1151                 : 
                               1152                 : static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = { /* Rule table named for TPS_InXMLEntityNumFirst: 'x' / 'X' selects the hex tables; a digit selects TPS_InXMLEntityNum. */
                               1153                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1154                 :     {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
                               1155                 :     {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
                               1156                 :     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
                               1157                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1158                 : };
                               1159                 : 
                               1160                 : static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = { /* Rule table named for TPS_InXMLEntityHexNumFirst: only a p_isxdigit match moves on, to TPS_InXMLEntityHexNum. */
                               1161                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1162                 :     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
                               1163                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* NULL test: catch-all last row; anything not matched above takes A_POP */
                               1164                 : };
                               1165                 : 
                               1166                 : static const TParserStateActionItem actionTPS_InXMLEntityNum[] = { /* Rule table named for TPS_InXMLEntityNum: digits loop in place; ';' moves to TPS_InXMLEntityEnd. */
                               1167                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1168                 :     {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
                               1169                 :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, /* terminating ';' */
                               1170                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1171                 : };
                               1172                 : 
                               1173                 : static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = { /* Rule table named for TPS_InXMLEntityHexNum: p_isxdigit loops in place; ';' moves to TPS_InXMLEntityEnd. */
                               1174                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1175                 :     {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
                               1176                 :     {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, /* terminating ';' */
                               1177                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1178                 : };
                               1179                 : 
                               1180                 : static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = { /* Rule table named for TPS_InXMLEntityEnd: a single unconditional row. */
                               1181                 :     {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL} /* no test: always A_BINGO | A_CLEAR with token type XMLENTITY, targeting TPS_Base */
                               1182                 : };
                               1183                 : 
                               1184                 : static const TParserStateActionItem actionTPS_InTagFirst[] = { /* Rule table named for TPS_InTagFirst: dispatches with A_PUSH on the character seen ('/', '!', '?', or a name-start character). */
                               1185                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1186                 :     {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
                               1187                 :     {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
                               1188                 :     {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
                               1189                 :     {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL}, /* p_isasclet, ':' and '_' all start a tag name */
                               1190                 :     {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
                               1191                 :     {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
                               1192                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1193                 : };
                               1194                 : 
                               1195                 : static const TParserStateActionItem actionTPS_InXMLBegin[] = { /* Rule table named for TPS_InXMLBegin: only an 'x' moves on, to TPS_InTag. */
                               1196                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1197                 :     /* <?xml ... */
                               1198                 :     /* XXX do we want states for the 'm' and 'l'?  Right now this accepts <?xZ */
                               1199                 :     {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
                               1200                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1201                 : };
                               1202                 : 
                               1203                 : static const TParserStateActionItem actionTPS_InTagCloseFirst[] = { /* Rule table named for TPS_InTagCloseFirst: only a p_isasclet match moves on, to TPS_InTagName. */
                               1204                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1205                 :     {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL}, /* unlike actionTPS_InTagFirst, ':' and '_' are not accepted here */
                               1206                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1207                 : };
                               1208                 : 
                               1209                 : static const TParserStateActionItem actionTPS_InTagName[] = { /* Rule table named for TPS_InTagName: '/', '>' or whitespace leaves the name; name characters take A_NEXT with TPS_Null (presumably "stay in this state" -- TODO confirm). */
                               1210                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1211                 :     /* <br/> case */
                               1212                 :     {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
                               1213                 :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags}, /* the rows leaving the name on '>' or whitespace attach the SpecialTags handler (defined outside this excerpt) */
                               1214                 :     {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
                               1215                 :     {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
                               1216                 :     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
                               1217                 :     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
                               1218                 :     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
                               1219                 :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
                               1220                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1221                 : };
                               1222                 : 
                               1223                 : static const TParserStateActionItem actionTPS_InTagBeginEnd[] = { /* Rule table named for TPS_InTagBeginEnd: only '>' moves on, to TPS_InTagEnd. */
                               1224                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1225                 :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL}, /* no SpecialTags handler on this '>' row, unlike the '>' rows of actionTPS_InTagName and actionTPS_InTag */
                               1226                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1227                 : };
                               1228                 : 
                               1229                 : static const TParserStateActionItem actionTPS_InTag[] = { /* Rule table named for TPS_InTag: '>' moves to TPS_InTagEnd, quotes open a quoted section, and a fixed set of characters takes A_NEXT with TPS_Null (presumably "stay in this state" -- TODO confirm). */
                               1230                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1231                 :     {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
                               1232                 :     {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL}, /* single-quoted section */
                               1233                 :     {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL}, /* double-quoted section */
                               1234                 :     {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
                               1235                 :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
                               1236                 :     {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
                               1237                 :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
                               1238                 :     {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
                               1239                 :     {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
                               1240                 :     {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
                               1241                 :     {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
                               1242                 :     {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
                               1243                 :     {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
                               1244                 :     {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
                               1245                 :     {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
                               1246                 :     {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
                               1247                 :     {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags}, /* whitespace also attaches the SpecialTags handler (defined outside this excerpt) */
                               1248                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL} /* any character not listed above takes A_POP */
                               1249                 : };
                               1250                 : 
                               1251                 : static const TParserStateActionItem actionTPS_InTagEscapeK[] = { /* Rule table named for TPS_InTagEscapeK, entered from TPS_InTag on a single quote: stays here until the closing single quote returns to TPS_InTag. */
                               1252                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1253                 :     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL}, /* backslash is handled by the TPS_InTagBackSleshed table */
                               1254                 :     {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
                               1255                 :     {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL} /* every other character stays in this state */
                               1256                 : };
                               1257                 : 
                               1258                 : static const TParserStateActionItem actionTPS_InTagEscapeKK[] = { /* Rule table named for TPS_InTagEscapeKK, entered from TPS_InTag on a double quote: stays here until the closing double quote returns to TPS_InTag. */
                               1259                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1260                 :     {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL}, /* backslash is handled by the TPS_InTagBackSleshed table */
                               1261                 :     {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
                               1262                 :     {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL} /* every other character stays in this state */
                               1263                 : };
                               1264                 : 
                               1265                 : static const TParserStateActionItem actionTPS_InTagBackSleshed[] = { /* Rule table named for TPS_InTagBackSleshed, pushed by the '\\' rows of the quoted-section tables: any non-EOF character takes A_MERGE. */
                               1266                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1267                 :     {NULL, 0, A_MERGE, TPS_Null, 0, NULL} /* NOTE(review): A_MERGE is defined outside this excerpt; presumably it folds this pushed state back into the one that pushed it -- confirm */
                               1268                 : };
                               1269                 : 
                               1270                 : static const TParserStateActionItem actionTPS_InTagEnd[] = { /* Rule table named for TPS_InTagEnd: a single unconditional row. */
                               1271                 :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL} /* no test: always A_BINGO | A_CLRALL with token type TAG_T, targeting TPS_Base */
                               1272                 : };
                               1273                 : 
                               1274                 : static const TParserStateActionItem actionTPS_InCommentFirst[] = { /* Rule table named for TPS_InCommentFirst (pushed by TPS_InTagFirst on '!'): '-' heads for the comment tables, 'D' / 'd' for TPS_InTag. */
                               1275                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1276                 :     {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
                               1277                 :     /* <!DOCTYPE ...> */
                               1278                 :     {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL}, /* only the first letter is checked here; the remaining characters are handled by the TPS_InTag rules */
                               1279                 :     {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
                               1280                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1281                 : };
                               1282                 : 
                               1283                 : static const TParserStateActionItem actionTPS_InCommentLast[] = { /* Rule table named for TPS_InCommentLast: only a second '-' moves on, to TPS_InComment. */
                               1284                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1285                 :     {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
                               1286                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1287                 : };
                               1288                 : 
                               1289                 : static const TParserStateActionItem actionTPS_InComment[] = { /* Rule table named for TPS_InComment: '-' moves to TPS_InCloseCommentFirst; every other non-EOF character takes A_NEXT. */
                               1290                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1291                 :     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
                               1292                 :     {NULL, 0, A_NEXT, TPS_Null, 0, NULL} /* TPS_Null target presumably means "stay in this state" -- TODO confirm */
                               1293                 : };
                               1294                 : 
                               1295                 : static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = { /* Rule table named for TPS_InCloseCommentFirst (one '-' seen inside a comment): a second '-' moves to TPS_InCloseCommentLast. */
                               1296                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1297                 :     {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
                               1298                 :     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL} /* anything else: back to TPS_InComment */
                               1299                 : };
                               1300                 : 
                               1301                 : static const TParserStateActionItem actionTPS_InCloseCommentLast[] = { /* Rule table named for TPS_InCloseCommentLast (two '-' seen inside a comment): '>' moves to TPS_InCommentEnd. */
                               1302                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1303                 :     {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL}, /* further '-' characters: A_NEXT with TPS_Null (presumably "stay in this state" -- TODO confirm) */
                               1304                 :     {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
                               1305                 :     {NULL, 0, A_NEXT, TPS_InComment, 0, NULL} /* anything else: back to TPS_InComment */
                               1306                 : };
                               1307                 : 
                               1308                 : static const TParserStateActionItem actionTPS_InCommentEnd[] = { /* Rule table named for TPS_InCommentEnd: a single unconditional row. */
                               1309                 :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL} /* no test: always A_BINGO | A_CLRALL with token type TAG_T (the same type actionTPS_InTagEnd uses), targeting TPS_Base */
                               1310                 : };
                               1311                 : 
                               1312                 : static const TParserStateActionItem actionTPS_InHostFirstDomain[] = { /* Rule table named for TPS_InHostFirstDomain (entered by the '.' rows of the host tables): a letter moves to TPS_InHostDomainSecond, a digit to TPS_InHost. */
                               1313                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1314                 :     {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
                               1315                 :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
                               1316                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1317                 : };
                               1318                 : 
                               1319                 : static const TParserStateActionItem actionTPS_InHostDomainSecond[] = { /* Rule table named for TPS_InHostDomainSecond: a second letter moves to TPS_InHostDomain; other host characters use A_PUSH into the host / email tables. */
                               1320                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, /* EOF takes A_POP here, whereas actionTPS_InHostDomain takes A_BINGO with HOST on EOF */
                               1321                 :     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
                               1322                 :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
                               1323                 :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
                               1324                 :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
                               1325                 :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
                               1326                 :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
                               1327                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1328                 : };
                               1329                 : 
                               1330                 : static const TParserStateActionItem actionTPS_InHostDomain[] = { /* Rule table named for TPS_InHostDomain: letters loop in place; EOF or unmatched input takes A_BINGO | A_CLRALL with token type HOST. Row order matters here -- see the notes on individual rows. */
                               1331                 :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
                               1332                 :     {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
                               1333                 :     {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
                               1334                 :     {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
                               1335                 :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
                               1336                 :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
                               1337                 :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
                               1338                 :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
                               1339                 :     {p_isdigit, 0, A_POP, TPS_Null, 0, NULL}, /* NOTE(review): second p_isdigit row, after the A_PUSH one above; presumably reached only when that pushed attempt fails and matching resumes below it -- confirm against the table driver before treating this as dead */
                               1340                 :     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL}, /* takes HOST and targets TPS_InURLPathStart instead of TPS_Base */
                               1341                 :     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
                               1342                 :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
                               1343                 : };
                               1344                 : 
                               1345                 : static const TParserStateActionItem actionTPS_InPortFirst[] = { /* Rule table named for TPS_InPortFirst (pushed by TPS_InHostDomain on ':'): only a p_isdigit match moves on, to TPS_InPort. */
                               1346                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1347                 :     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
                               1348                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1349                 : };
                               1350                 : 
                               1351                 : static const TParserStateActionItem actionTPS_InPort[] = { /* Rule table named for TPS_InPort: digits loop in place; EOF or unmatched input takes A_BINGO | A_CLRALL with token type HOST. */
                               1352                 :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
                               1353                 :     {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
                               1354                 :     {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL}, /* takes HOST and targets TPS_InURLPathStart instead of TPS_Base */
                               1355                 :     {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
                               1356                 :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
                               1357                 : };
                               1358                 : 
                               1359                 : static const TParserStateActionItem actionTPS_InHostFirstAN[] = { /* Rule table named for TPS_InHostFirstAN (pushed by the '-' / '_' rows of the host tables): a digit or letter must follow, moving to TPS_InHost. */
                               1360                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1361                 :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
                               1362                 :     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
                               1363                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1364                 : };
                               1365                 : 
                               1366                 : static const TParserStateActionItem actionTPS_InHost[] = { /* Rule table named for TPS_InHost: digits and letters loop in place; '@', '.', '-' and '_' use A_PUSH into the email / domain / separator tables. No row in this table takes A_BINGO. */
                               1367                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1368                 :     {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
                               1369                 :     {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
                               1370                 :     {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
                               1371                 :     {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
                               1372                 :     {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
                               1373                 :     {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
                               1374                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1375                 : };
                               1376                 : 
                               1377                 : static const TParserStateActionItem actionTPS_InEmail[] = { /* Rule table named for TPS_InEmail (pushed by the '@' rows of the host tables): a p_ishost match takes A_BINGO | A_CLRALL with token type EMAIL. */
                               1378                 :     {p_isstophost, 0, A_POP, TPS_Null, 0, NULL}, /* checked before p_ishost; note this table has no p_isEOF row */
                               1379                 :     {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL}, /* NOTE(review): p_ishost is defined outside this excerpt; presumably it checks that a host follows -- confirm */
                               1380                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1381                 : };
                               1382                 : 
                               1383                 : static const TParserStateActionItem actionTPS_InFileFirst[] = { /* Rule table named for TPS_InFileFirst: a letter, digit or '_' moves to TPS_InFile; '.' moves to TPS_InPathFirst; '~' pushes TPS_InFileTwiddle. */
                               1384                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1385                 :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
                               1386                 :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
                               1387                 :     {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
                               1388                 :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
                               1389                 :     {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL}, /* the only A_PUSH row in this table */
                               1390                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1391                 : };
                               1392                 : 
                               1393                 : static const TParserStateActionItem actionTPS_InFileTwiddle[] = { /* Rule table named for TPS_InFileTwiddle (pushed by TPS_InFileFirst on '~'): a letter, digit or '_' moves to TPS_InFile; '/' moves to TPS_InFileFirst. */
                               1394                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1395                 :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
                               1396                 :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
                               1397                 :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
                               1398                 :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
                               1399                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1400                 : };
                               1401                 : 
                               1402                 : static const TParserStateActionItem actionTPS_InPathFirst[] = { /* Rule table named for TPS_InPathFirst (entered from TPS_InFileFirst on '.'): a letter, digit or '_' moves to TPS_InFile; a second '.' moves to TPS_InPathSecond; '/' moves to TPS_InFileFirst. */
                               1403                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1404                 :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
                               1405                 :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
                               1406                 :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
                               1407                 :     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
                               1408                 :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
                               1409                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1410                 : };
                               1411                 : 
                               1412                 : static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
                                                          /*
                                                           * Rules for state TPS_InPathFirstFirst.  Only '.' (to
                                                           * TPS_InPathSecond) and '/' (to TPS_InFileFirst) advance; EOF
                                                           * and every other character select an A_POP rule.
                                                           */
                               1413                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1414                 :     {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
                               1415                 :     {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
                               1416                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1417                 : };
                               1418                 : 
                               1419                 : static const TParserStateActionItem actionTPS_InPathSecond[] = {
                                                          /*
                                                           * Rules for state TPS_InPathSecond.  EOF, '/' and a character
                                                           * accepted by p_isspace each have a rule with flags
                                                           * A_BINGO | A_CLEAR, token type FILEPATH and next state
                                                           * TPS_Base; any other character selects the A_POP rule.
                                                           *
                                                           * '/' appears twice.  TParserGet scans rules top to bottom and
                                                           * stops at the first match, so a fresh scan always picks the
                                                           * first '/' rule (A_NEXT | A_PUSH into TPS_InFileFirst).  The
                                                           * second '/' rule can only be selected when TParserGet resumes
                                                           * scanning at pushedAtAction + 1 after a POP.
                                                           * NOTE(review): assumes pushedAtAction points at the rule that
                                                           * did the A_PUSH -- confirm in the A_PUSH handling.
                                                           */
                               1420                 :     {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
                               1421                 :     {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
                               1422                 :     {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
                               1423                 :     {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
                               1424                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1425                 : };
                               1426                 : 
                               1427                 : static const TParserStateActionItem actionTPS_InFile[] = {
                                                          /*
                                                           * Rules for state TPS_InFile.  A character accepted by
                                                           * p_isasclet or p_isdigit, or a literal '_' or '-', stays in
                                                           * TPS_InFile.  '.' and '/' carry A_PUSH and name
                                                           * TPS_InFileNext and TPS_InFileFirst respectively.  EOF and
                                                           * every other character select an A_BINGO rule with token type
                                                           * FILEPATH and next state TPS_Base.
                                                           */
                               1428                 :     {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
                               1429                 :     {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
                               1430                 :     {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
                               1431                 :     {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
                               1432                 :     {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
                               1433                 :     {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
                               1434                 :     {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
                               1435                 :     {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
                               1436                 : };
                               1437                 : 
                               1438                 : static const TParserStateActionItem actionTPS_InFileNext[] = {
                                                          /*
                                                           * Rules for state TPS_InFileNext.  A character accepted by
                                                           * p_isasclet or p_isdigit, or a literal '_', selects an A_CLEAR
                                                           * rule that names TPS_InFile; EOF and every other character
                                                           * select an A_POP rule.
                                                           */
                               1439                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1440                 :     {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
                               1441                 :     {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
                               1442                 :     {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
                               1443                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1444                 : };
                               1445                 : 
                               1446                 : static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
                                                          /*
                                                           * Rules for state TPS_InURLPathFirst.  A character accepted by
                                                           * p_isurlchar moves to TPS_InURLPath; EOF and every other
                                                           * character select an A_POP rule.
                                                           */
                               1447                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1448                 :     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
                               1449                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL},
                               1450                 : };
                               1451                 : 
                               1452                 : static const TParserStateActionItem actionTPS_InURLPathStart[] = {
                                                          /*
                                                           * State TPS_InURLPathStart has a single rule with a NULL test,
                                                           * so it is selected for any input: A_NEXT into TPS_InURLPath.
                                                           */
                               1453                 :     {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
                               1454                 : };
                               1455                 : 
                               1456                 : static const TParserStateActionItem actionTPS_InURLPath[] = {
                                                          /*
                                                           * Rules for state TPS_InURLPath.  The state repeats while
                                                           * p_isurlchar accepts the current character; EOF and every
                                                           * other character select an A_BINGO rule with token type
                                                           * URLPATH and next state TPS_Base.
                                                           */
                               1457                 :     {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
                               1458                 :     {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
                               1459                 :     {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
                               1460                 : };
                               1461                 : 
                               1462                 : static const TParserStateActionItem actionTPS_InFURL[] = {
                                                          /*
                                                           * Rules for state TPS_InFURL.  When p_isURLPath returns nonzero
                                                           * the rule carries A_BINGO | A_CLRALL, token type URL_T, next
                                                           * state TPS_Base and the special handler SpecialFURL; EOF and
                                                           * every other outcome select an A_POP rule.
                                                           */
                               1463                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1464                 :     {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
                               1465                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1466                 : };
                               1467                 : 
                               1468                 : static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
                                                          /*
                                                           * Rules for state TPS_InProtocolFirst.  Only '/' advances (to
                                                           * TPS_InProtocolSecond); EOF and every other character select
                                                           * an A_POP rule.
                                                           */
                               1469                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1470                 :     {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
                               1471                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1472                 : };
                               1473                 : 
                               1474                 : static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
                                                          /*
                                                           * Rules for state TPS_InProtocolSecond.  Only '/' advances (to
                                                           * TPS_InProtocolEnd); EOF and every other character select an
                                                           * A_POP rule.
                                                           */
                               1475                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1476                 :     {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
                               1477                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1478                 : };
                               1479                 : 
                               1480                 : static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
                                                          /*
                                                           * State TPS_InProtocolEnd has a single rule with a NULL test,
                                                           * so it is selected for any input: A_BINGO | A_CLRALL with
                                                           * token type PROTOCOL and next state TPS_Base.
                                                           */
                               1481                 :     {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
                               1482                 : };
                               1483                 : 
                               1484                 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenAsciiWordFirst.  p_isasclet is
                                                           * tested before p_isalpha, so a character accepted by both goes
                                                           * to TPS_InHyphenAsciiWord; one accepted only by p_isalpha goes
                                                           * to TPS_InHyphenWord.  A digit goes to
                                                           * TPS_InHyphenDigitLookahead.  EOF and every other character
                                                           * select an A_POP rule.
                                                           */
                               1485                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1486                 :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
                               1487                 :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
                               1488                 :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
                               1489                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1490                 : };
                               1491                 : 
                               1492                 : static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenAsciiWord.  The state repeats
                                                           * while p_isasclet accepts; a character accepted by p_isalpha
                                                           * or p_isspecial switches to TPS_InHyphenWord and a digit to
                                                           * TPS_InHyphenNumWord.  '-' carries A_PUSH and names
                                                           * TPS_InHyphenAsciiWordFirst.  EOF and every other character
                                                           * select a rule with A_BINGO | A_CLRALL, token type ASCIIHWORD,
                                                           * next state TPS_InParseHyphen and special handler
                                                           * SpecialHyphen.
                                                           */
                               1493                 :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
                               1494                 :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
                               1495                 :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
                               1496                 :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
                               1497                 :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
                               1498                 :     {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
                               1499                 :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
                               1500                 : };
                               1501                 : 
                               1502                 : static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenWordFirst.  A character accepted
                                                           * by p_isalpha goes to TPS_InHyphenWord and a digit to
                                                           * TPS_InHyphenDigitLookahead; EOF and every other character
                                                           * select an A_POP rule.
                                                           */
                               1503                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1504                 :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
                               1505                 :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
                               1506                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1507                 : };
                               1508                 : 
                               1509                 : static const TParserStateActionItem actionTPS_InHyphenWord[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenWord.  The state repeats while
                                                           * p_isalpha or p_isspecial accepts; a digit switches to
                                                           * TPS_InHyphenNumWord.  '-' carries A_PUSH and names
                                                           * TPS_InHyphenWordFirst.  EOF and every other character select
                                                           * a rule with A_BINGO | A_CLRALL, token type HWORD, next state
                                                           * TPS_InParseHyphen and special handler SpecialHyphen.
                                                           */
                               1510                 :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
                               1511                 :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
                               1512                 :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
                               1513                 :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
                               1514                 :     {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
                               1515                 :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
                               1516                 : };
                               1517                 : 
                               1518                 : static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenNumWordFirst.  A character
                                                           * accepted by p_isalpha goes to TPS_InHyphenNumWord and a digit
                                                           * to TPS_InHyphenDigitLookahead; EOF and every other character
                                                           * select an A_POP rule.
                                                           */
                               1519                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1520                 :     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
                               1521                 :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
                               1522                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1523                 : };
                               1524                 : 
                               1525                 : static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenNumWord.  The state repeats while
                                                           * p_isalnum or p_isspecial accepts.  '-' carries A_PUSH and
                                                           * names TPS_InHyphenNumWordFirst.  EOF and every other
                                                           * character select a rule with A_BINGO | A_CLRALL, token type
                                                           * NUMHWORD, next state TPS_InParseHyphen and special handler
                                                           * SpecialHyphen.
                                                           */
                               1526                 :     {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
                               1527                 :     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
                               1528                 :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
                               1529                 :     {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
                               1530                 :     {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
                               1531                 : };
                               1532                 : 
                               1533                 : static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenDigitLookahead.  The state
                                                           * repeats while p_isdigit accepts; a character accepted by
                                                           * p_isalpha or p_isspecial goes to TPS_InHyphenNumWord.  EOF
                                                           * and every other character select an A_POP rule, so a run of
                                                           * digits alone never leaves this state through A_NEXT.
                                                           */
                               1534                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1535                 :     {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
                               1536                 :     {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
                               1537                 :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
                               1538                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1539                 : };
                               1540                 : 
                               1541                 : static const TParserStateActionItem actionTPS_InParseHyphen[] = {
                                                          /*
                                                           * Rules for state TPS_InParseHyphen.  p_isasclet is tested
                                                           * before p_isalpha: the former goes to
                                                           * TPS_InHyphenAsciiWordPart, the latter to
                                                           * TPS_InHyphenWordPart.  A digit carries A_PUSH and names
                                                           * TPS_InHyphenUnsignedInt; '-' carries A_PUSH and names
                                                           * TPS_InParseHyphenHyphen.  EOF and every other character
                                                           * select an A_RERUN rule with next state TPS_Base.
                                                           */
                               1542                 :     {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
                               1543                 :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
                               1544                 :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
                               1545                 :     {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
                               1546                 :     {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
                               1547                 :     {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
                               1548                 : };
                               1549                 : 
                               1550                 : static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
                                                          /*
                                                           * Rules for state TPS_InParseHyphenHyphen.  A character
                                                           * accepted by p_isalnum or p_isspecial selects a rule with
                                                           * A_BINGO | A_CLEAR, token type SPACE and next state
                                                           * TPS_InParseHyphen; EOF and every other character select an
                                                           * A_POP rule.
                                                           */
                               1551                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1552                 :     {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
                               1553                 :     {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
                               1554                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1555                 : };
                               1556                 : 
                               1557                 : static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenWordPart.  The state repeats
                                                           * while p_isalpha or p_isspecial accepts; a digit switches to
                                                           * TPS_InHyphenNumWordPart.  Both terminating rules carry
                                                           * A_BINGO with token type PARTHWORD, but they name different
                                                           * next states: TPS_Base at EOF, TPS_InParseHyphen for any other
                                                           * character.
                                                           */
                               1558                 :     {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
                               1559                 :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
                               1560                 :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
                               1561                 :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
                               1562                 :     {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
                               1563                 : };
                               1564                 : 
                               1565                 : static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenAsciiWordPart.  The state repeats
                                                           * while p_isasclet accepts; a character accepted by p_isalpha
                                                           * or p_isspecial switches to TPS_InHyphenWordPart and a digit
                                                           * to TPS_InHyphenNumWordPart.  Both terminating rules carry
                                                           * A_BINGO with token type ASCIIPARTHWORD, but they name
                                                           * different next states: TPS_Base at EOF, TPS_InParseHyphen for
                                                           * any other character.
                                                           */
                               1566                 :     {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
                               1567                 :     {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
                               1568                 :     {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
                               1569                 :     {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
                               1570                 :     {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
                               1571                 :     {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
                               1572                 : };
                               1573                 : 
                               1574                 : static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenNumWordPart.  The state repeats
                                                           * while p_isalnum or p_isspecial accepts.  Both terminating
                                                           * rules carry A_BINGO with token type NUMPARTHWORD, but they
                                                           * name different next states: TPS_Base at EOF,
                                                           * TPS_InParseHyphen for any other character.
                                                           */
                               1575                 :     {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
                               1576                 :     {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
                               1577                 :     {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
                               1578                 :     {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
                               1579                 : };
                               1580                 : 
                               1581                 : static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
                                                          /*
                                                           * Rules for state TPS_InHyphenUnsignedInt.  A character
                                                           * accepted by p_isalpha or p_isspecial selects an A_CLEAR rule
                                                           * that names TPS_InHyphenNumWordPart; EOF and every other
                                                           * non-digit select an A_POP rule.
                                                           *
                                                           * The digit rule is A_NEXT with next state TPS_Null rather than
                                                           * this state's own name.
                                                           * NOTE(review): presumably TPS_Null here means "do not change
                                                           * state", i.e. keep consuming digits -- confirm against the
                                                           * tostate handling in TParserGet.
                                                           */
                               1582                 :     {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
                               1583                 :     {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
                               1584                 :     {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
                               1585                 :     {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
                               1586                 :     {NULL, 0, A_POP, TPS_Null, 0, NULL}
                               1587                 : };
                               1588                 : 
                               1589                 : 
                               1590                 : /*
                               1591                 :  * main table of per-state parser actions
                                                       *
                                                       * One TParserStateAction describes one parser state: a pointer to that
                                                       * state's rule array plus the state's own enum value.  TParserGet indexes
                                                       * the table by state and Asserts that the stored value equals the index,
                                                       * which is the only use of the state member.  state_name exists only when
                                                       * WPARSER_TRACE is defined and is printed by the trace output.
                               1592                 :  */
                               1593                 : typedef struct
                               1594                 : {
                               1595                 :     const TParserStateActionItem *action;   /* the actual state info */
                               1596                 :     TParserState state;         /* only for Assert crosscheck */
                               1597                 : #ifdef WPARSER_TRACE
                               1598                 :     const char *state_name;     /* only for debug printout */
                               1599                 : #endif
                               1600                 : } TParserStateAction;
                               1601                 : 
                               1602                 : #ifdef WPARSER_TRACE
                                                      /*
                                                       * TPARSERSTATEACTION(state) expands to the initializer of one
                                                       * TParserStateAction row: CppConcat(action,state) names the rule array
                                                       * (e.g. actionTPS_Base for TPS_Base), followed by the state value itself.
                                                       * The WPARSER_TRACE variant adds CppAsString(state) to fill state_name;
                                                       * the other variant omits it because that member is not compiled in.
                                                       */
                               1603                 : #define TPARSERSTATEACTION(state) \
                               1604                 :     { CppConcat(action,state), state, CppAsString(state) }
                               1605                 : #else
                               1606                 : #define TPARSERSTATEACTION(state) \
                               1607                 :     { CppConcat(action,state), state }
                               1608                 : #endif
                               1609                 : 
                               1610                 : /*
                               1611                 :  * order must be the same as in typedef enum {} TParserState!!
                                                       *
                                                       * Actions[] is indexed by the TParserState value: TParserGet fetches
                                                       * Actions[prs->state->state].action and Asserts that the row's state
                                                       * member equals the index used, so a row placed out of enum order trips
                                                       * that Assert.  TParserGet also Asserts that the index is below TPS_Null,
                                                       * so TPS_Null has no row here.
                               1612                 :  */
                               1613                 : 
                               1614                 : static const TParserStateAction Actions[] = {
                               1615                 :     TPARSERSTATEACTION(TPS_Base),
                               1616                 :     TPARSERSTATEACTION(TPS_InNumWord),
                               1617                 :     TPARSERSTATEACTION(TPS_InAsciiWord),
                               1618                 :     TPARSERSTATEACTION(TPS_InWord),
                               1619                 :     TPARSERSTATEACTION(TPS_InUnsignedInt),
                               1620                 :     TPARSERSTATEACTION(TPS_InSignedIntFirst),
                               1621                 :     TPARSERSTATEACTION(TPS_InSignedInt),
                               1622                 :     TPARSERSTATEACTION(TPS_InSpace),
                               1623                 :     TPARSERSTATEACTION(TPS_InUDecimalFirst),
                               1624                 :     TPARSERSTATEACTION(TPS_InUDecimal),
                               1625                 :     TPARSERSTATEACTION(TPS_InDecimalFirst),
                               1626                 :     TPARSERSTATEACTION(TPS_InDecimal),
                               1627                 :     TPARSERSTATEACTION(TPS_InVerVersion),
                               1628                 :     TPARSERSTATEACTION(TPS_InSVerVersion),
                               1629                 :     TPARSERSTATEACTION(TPS_InVersionFirst),
                               1630                 :     TPARSERSTATEACTION(TPS_InVersion),
                               1631                 :     TPARSERSTATEACTION(TPS_InMantissaFirst),
                               1632                 :     TPARSERSTATEACTION(TPS_InMantissaSign),
                               1633                 :     TPARSERSTATEACTION(TPS_InMantissa),
                               1634                 :     TPARSERSTATEACTION(TPS_InXMLEntityFirst),
                               1635                 :     TPARSERSTATEACTION(TPS_InXMLEntity),
                               1636                 :     TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
                               1637                 :     TPARSERSTATEACTION(TPS_InXMLEntityNum),
                               1638                 :     TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
                               1639                 :     TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
                               1640                 :     TPARSERSTATEACTION(TPS_InXMLEntityEnd),
                               1641                 :     TPARSERSTATEACTION(TPS_InTagFirst),
                               1642                 :     TPARSERSTATEACTION(TPS_InXMLBegin),
                               1643                 :     TPARSERSTATEACTION(TPS_InTagCloseFirst),
                               1644                 :     TPARSERSTATEACTION(TPS_InTagName),
                               1645                 :     TPARSERSTATEACTION(TPS_InTagBeginEnd),
                               1646                 :     TPARSERSTATEACTION(TPS_InTag),
                               1647                 :     TPARSERSTATEACTION(TPS_InTagEscapeK),
                               1648                 :     TPARSERSTATEACTION(TPS_InTagEscapeKK),
                               1649                 :     TPARSERSTATEACTION(TPS_InTagBackSleshed),
                               1650                 :     TPARSERSTATEACTION(TPS_InTagEnd),
                               1651                 :     TPARSERSTATEACTION(TPS_InCommentFirst),
                               1652                 :     TPARSERSTATEACTION(TPS_InCommentLast),
                               1653                 :     TPARSERSTATEACTION(TPS_InComment),
                               1654                 :     TPARSERSTATEACTION(TPS_InCloseCommentFirst),
                               1655                 :     TPARSERSTATEACTION(TPS_InCloseCommentLast),
                               1656                 :     TPARSERSTATEACTION(TPS_InCommentEnd),
                               1657                 :     TPARSERSTATEACTION(TPS_InHostFirstDomain),
                               1658                 :     TPARSERSTATEACTION(TPS_InHostDomainSecond),
                               1659                 :     TPARSERSTATEACTION(TPS_InHostDomain),
                               1660                 :     TPARSERSTATEACTION(TPS_InPortFirst),
                               1661                 :     TPARSERSTATEACTION(TPS_InPort),
                               1662                 :     TPARSERSTATEACTION(TPS_InHostFirstAN),
                               1663                 :     TPARSERSTATEACTION(TPS_InHost),
                               1664                 :     TPARSERSTATEACTION(TPS_InEmail),
                               1665                 :     TPARSERSTATEACTION(TPS_InFileFirst),
                               1666                 :     TPARSERSTATEACTION(TPS_InFileTwiddle),
                               1667                 :     TPARSERSTATEACTION(TPS_InPathFirst),
                               1668                 :     TPARSERSTATEACTION(TPS_InPathFirstFirst),
                               1669                 :     TPARSERSTATEACTION(TPS_InPathSecond),
                               1670                 :     TPARSERSTATEACTION(TPS_InFile),
                               1671                 :     TPARSERSTATEACTION(TPS_InFileNext),
                               1672                 :     TPARSERSTATEACTION(TPS_InURLPathFirst),
                               1673                 :     TPARSERSTATEACTION(TPS_InURLPathStart),
                               1674                 :     TPARSERSTATEACTION(TPS_InURLPath),
                               1675                 :     TPARSERSTATEACTION(TPS_InFURL),
                               1676                 :     TPARSERSTATEACTION(TPS_InProtocolFirst),
                               1677                 :     TPARSERSTATEACTION(TPS_InProtocolSecond),
                               1678                 :     TPARSERSTATEACTION(TPS_InProtocolEnd),
                               1679                 :     TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
                               1680                 :     TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
                               1681                 :     TPARSERSTATEACTION(TPS_InHyphenWordFirst),
                               1682                 :     TPARSERSTATEACTION(TPS_InHyphenWord),
                               1683                 :     TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
                               1684                 :     TPARSERSTATEACTION(TPS_InHyphenNumWord),
                               1685                 :     TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
                               1686                 :     TPARSERSTATEACTION(TPS_InParseHyphen),
                               1687                 :     TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
                               1688                 :     TPARSERSTATEACTION(TPS_InHyphenWordPart),
                               1689                 :     TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
                               1690                 :     TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
                               1691                 :     TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
                               1692                 : };
                               1693                 : 
                               1694                 : 
                               1695                 : static bool
 5624 bruce                    1696           14438 : TParserGet(TParser *prs)
                               1697                 : {
 5643 tgl                      1698           14438 :     const TParserStateActionItem *item = NULL;
                               1699                 : 
 5647                          1700           14438 :     Assert(prs->state);
                               1701                 : 
 5710                          1702           14438 :     if (prs->state->posbyte >= prs->lenstr)
                               1703            2365 :         return false;
                               1704                 : 
 5643                          1705           12073 :     prs->token = prs->str + prs->state->posbyte;
 5710                          1706           12073 :     prs->state->pushedAtAction = NULL;
                               1707                 : 
                               1708                 :     /* look at string */
                               1709           51585 :     while (prs->state->posbyte <= prs->lenstr)
                               1710                 :     {
                               1711           51585 :         if (prs->state->posbyte == prs->lenstr)
                               1712            2440 :             prs->state->charlen = 0;
                               1713                 :         else
                               1714           98290 :             prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
                               1715           49145 :                 pg_mblen(prs->str + prs->state->posbyte);
                               1716                 : 
                               1717           51585 :         Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
                               1718           51585 :         Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
                               1719           51585 :         Assert(Actions[prs->state->state].state == prs->state->state);
                               1720                 : 
 5643                          1721           51585 :         if (prs->state->pushedAtAction)
                               1722                 :         {
                               1723                 :             /* After a POP, pick up at the next test */
                               1724            1296 :             item = prs->state->pushedAtAction + 1;
                               1725            1296 :             prs->state->pushedAtAction = NULL;
                               1726                 :         }
                               1727                 :         else
                               1728                 :         {
                               1729           50289 :             item = Actions[prs->state->state].action;
                               1730           50289 :             Assert(item != NULL);
                               1731                 :         }
                               1732                 : 
                               1733                 :         /* find action by character class */
 5710                          1734          277734 :         while (item->isclass)
                               1735                 :         {
                               1736          262062 :             prs->c = item->c;
                               1737          262062 :             if (item->isclass(prs) != 0)
 5643                          1738           35913 :                 break;
 5710                          1739          226149 :             item++;
                               1740                 :         }
                               1741                 : 
                               1742                 : #ifdef WPARSER_TRACE
                               1743                 :         {
                               1744                 :             TParserPosition *ptr;
                               1745                 : 
                               1746                 :             fprintf(stderr, "state ");
                               1747                 :             /* indent according to stack depth */
                               1748                 :             for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
                               1749                 :                 fprintf(stderr, "  ");
                               1750                 :             fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
                               1751                 :             if (prs->state->posbyte < prs->lenstr)
                               1752                 :                 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
                               1753                 :             else
                               1754                 :                 fprintf(stderr, "at EOF");
                               1755                 :             fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
                               1756                 :                     (int) (item - Actions[prs->state->state].action),
                               1757                 :                     (item->flags & A_BINGO) ? " BINGO" : "",
                               1758                 :                     (item->flags & A_POP) ? " POP" : "",
                               1759                 :                     (item->flags & A_PUSH) ? " PUSH" : "",
                               1760                 :                     (item->flags & A_RERUN) ? " RERUN" : "",
                               1761                 :                     (item->flags & A_CLEAR) ? " CLEAR" : "",
                               1762                 :                     (item->flags & A_MERGE) ? " MERGE" : "",
                               1763                 :                     (item->flags & A_CLRALL) ? " CLRALL" : "",
                               1764                 :                     (item->tostate != TPS_Null) ? " tostate " : "",
                               1765                 :                     (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
                               1766                 :                     (item->type > 0) ? " type " : "",
                               1767                 :                     tok_alias[item->type]);
                               1768                 :         }
                               1769                 : #endif
                               1770                 : 
                               1771                 :         /* call special handler if exists */
                               1772           51585 :         if (item->special)
                               1773             210 :             item->special(prs);
                               1774                 : 
                               1775                 :         /* BINGO, token is found */
                               1776           51585 :         if (item->flags & A_BINGO)
                               1777                 :         {
                               1778           12073 :             Assert(item->type > 0);
 5643                          1779           12073 :             prs->lenbytetoken = prs->state->lenbytetoken;
                               1780           12073 :             prs->lenchartoken = prs->state->lenchartoken;
                               1781           12073 :             prs->state->lenbytetoken = prs->state->lenchartoken = 0;
 5710                          1782           12073 :             prs->type = item->type;
                               1783                 :         }
                               1784                 : 
                               1785                 :         /* do various actions by flags */
                               1786           51585 :         if (item->flags & A_POP)
                               1787                 :         {                       /* pop stored state in stack */
                               1788            1305 :             TParserPosition *ptr = prs->state->prev;
                               1789                 : 
                               1790            1305 :             pfree(prs->state);
                               1791            1305 :             prs->state = ptr;
                               1792            1305 :             Assert(prs->state);
                               1793                 :         }
                               1794           50280 :         else if (item->flags & A_PUSH)
                               1795                 :         {                       /* push (store) state in stack */
                               1796            2544 :             prs->state->pushedAtAction = item;    /* remember where we push */
                               1797            2544 :             prs->state = newTParserPosition(prs->state);
                               1798                 :         }
                               1799           47736 :         else if (item->flags & A_CLEAR)
                               1800                 :         {                       /* clear previous pushed state */
                               1801                 :             TParserPosition *ptr;
                               1802                 : 
                               1803             249 :             Assert(prs->state->prev);
                               1804             249 :             ptr = prs->state->prev->prev;
                               1805             249 :             pfree(prs->state->prev);
                               1806             249 :             prs->state->prev = ptr;
                               1807                 :         }
                               1808           47487 :         else if (item->flags & A_CLRALL)
                               1809                 :         {                       /* clear all previous pushed state */
                               1810                 :             TParserPosition *ptr;
                               1811                 : 
                               1812            1389 :             while (prs->state->prev)
                               1813                 :             {
                               1814             999 :                 ptr = prs->state->prev->prev;
                               1815             999 :                 pfree(prs->state->prev);
                               1816             999 :                 prs->state->prev = ptr;
                               1817                 :             }
                               1818                 :         }
                               1819           47097 :         else if (item->flags & A_MERGE)
                               1820                 :         {                       /* merge posinfo with current and pushed state */
 5710 tgl                      1821 UBC           0 :             TParserPosition *ptr = prs->state;
                               1822                 : 
                               1823               0 :             Assert(prs->state->prev);
                               1824               0 :             prs->state = prs->state->prev;
                               1825                 : 
                               1826               0 :             prs->state->posbyte = ptr->posbyte;
                               1827               0 :             prs->state->poschar = ptr->poschar;
                               1828               0 :             prs->state->charlen = ptr->charlen;
 5643                          1829               0 :             prs->state->lenbytetoken = ptr->lenbytetoken;
                               1830               0 :             prs->state->lenchartoken = ptr->lenchartoken;
 5710                          1831               0 :             pfree(ptr);
                               1832                 :         }
                               1833                 : 
                               1834                 :         /* set new state if pointed */
 5710 tgl                      1835 CBC       51585 :         if (item->tostate != TPS_Null)
                               1836           33077 :             prs->state->state = item->tostate;
                               1837                 : 
                               1838                 :         /* check for go away */
 5647                          1839           51585 :         if ((item->flags & A_BINGO) ||
                               1840           39512 :             (prs->state->posbyte >= prs->lenstr &&
 5647 tgl                      1841 UBC           0 :              (item->flags & A_RERUN) == 0))
                               1842                 :             break;
                               1843                 : 
                               1844                 :         /* go to beginning of loop if we should rerun or we just restore state */
 5710 tgl                      1845 CBC       39512 :         if (item->flags & (A_RERUN | A_POP))
                               1846            1317 :             continue;
                               1847                 : 
                               1848                 :         /* move forward */
                               1849           38195 :         if (prs->state->charlen)
                               1850                 :         {
                               1851           38195 :             prs->state->posbyte += prs->state->charlen;
 5643                          1852           38195 :             prs->state->lenbytetoken += prs->state->charlen;
 5710                          1853           38195 :             prs->state->poschar++;
 5643                          1854           38195 :             prs->state->lenchartoken++;
                               1855                 :         }
                               1856                 :     }
                               1857                 : 
  545 michael                  1858           12073 :     return (item && (item->flags & A_BINGO));
                               1859                 : }
                               1860                 : 
                                                      /*
                                                       * prsd_lextype: build the table describing this parser's token types.
                                                       *
                                                       * Returns (as a pointer Datum) a palloc'd array of LASTNUM + 1 LexDescr
                                                       * entries.  Slot i - 1 describes token type i: its lexid is i, and alias
                                                       * and descr are pstrdup'd copies of tok_alias[i] and lex_descr[i].  In the
                                                       * final slot only lexid is set, to 0 (presumably the end-of-list marker
                                                       * that callers look for -- confirm against the caller).
                                                       */
                               1861                 : Datum
 5710 tgl                      1862           25703 : prsd_lextype(PG_FUNCTION_ARGS)
                               1863                 : {
                               1864           25703 :     LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
                               1865                 :     int         i;
                               1866                 : 
                                                          /* token type ids run from 1, array slots from 0 */
                               1867          616872 :     for (i = 1; i <= LASTNUM; i++)
                               1868                 :     {
                               1869          591169 :         descr[i - 1].lexid = i;
                               1870          591169 :         descr[i - 1].alias = pstrdup(tok_alias[i]);
                               1871          591169 :         descr[i - 1].descr = pstrdup(lex_descr[i]);
                               1872                 :     }
                               1873                 : 
                                                          /* extra trailing slot: only lexid is initialized */
                               1874           25703 :     descr[LASTNUM].lexid = 0;
                               1875                 : 
                               1876           25703 :     PG_RETURN_POINTER(descr);
                               1877                 : }
                               1878                 : 
                                                      /*
                                                       * prsd_start: set up parser state for one input string.
                                                       *
                                                       * Argument 0 is cast to char * and argument 1 is read as an int32; both
                                                       * are handed to TParserInit(), and its result is returned as a pointer
                                                       * Datum.
                                                       *
                                                       * NOTE(review): argument 1 is presumably the length of the text in
                                                       * bytes -- confirm against TParserInit().
                                                       */
                               1879                 : Datum
                               1880            2365 : prsd_start(PG_FUNCTION_ARGS)
                               1881                 : {
                               1882            2365 :     PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
                               1883                 : }
                               1884                 : 
                                                      /*
                                                       * prsd_nexttoken: fetch the next token from a parser.
                                                       *
                                                       * Argument 0 is the TParser; arguments 1 and 2 are output pointers
                                                       * (char ** and int *).
                                                       *
                                                       * If TParserGet() reports that no token was found, returns 0 and leaves
                                                       * both outputs untouched.  Otherwise stores p->token and p->lenbytetoken
                                                       * through the output pointers and returns p->type.  The token pointer is
                                                       * handed back as-is; no copy is made here.
                                                       */
                               1885                 : Datum
                               1886           14318 : prsd_nexttoken(PG_FUNCTION_ARGS)
                               1887                 : {
                               1888           14318 :     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
                               1889           14318 :     char      **t = (char **) PG_GETARG_POINTER(1);
                               1890           14318 :     int        *tlen = (int *) PG_GETARG_POINTER(2);
                               1891                 : 
                                                          /* a zero result tells the caller there are no more tokens */
                               1892           14318 :     if (!TParserGet(p))
                               1893            2365 :         PG_RETURN_INT32(0);
                               1894                 : 
 5643                          1895           11953 :     *t = p->token;
                               1896           11953 :     *tlen = p->lenbytetoken;
                               1897                 : 
 5710                          1898           11953 :     PG_RETURN_INT32(p->type);
                               1899                 : }
                               1900                 : 
                                                      /*
                                                       * prsd_end: finish a parse.
                                                       *
                                                       * Passes the TParser given as argument 0 to TParserClose() and returns
                                                       * void.  NOTE(review): the parser is presumably unusable afterwards --
                                                       * confirm against TParserClose().
                                                       */
                               1901                 : Datum
                               1902            2365 : prsd_end(PG_FUNCTION_ARGS)
                               1903                 : {
                               1904            2365 :     TParser    *p = (TParser *) PG_GETARG_POINTER(0);
                               1905                 : 
                               1906            2365 :     TParserClose(p);
                               1907            2365 :     PG_RETURN_VOID();
                               1908                 : }
                               1909                 : 
                               1910                 : 
                               1911                 : /*
                               1912                 :  * ts_headline support begins here
                               1913                 :  */
                               1914                 : 
                               1915                 : /*
                                                       * Token type classification macros.
                                                       *
                                                       * Each takes a token type code and expands to a boolean expression.
                                                       * The argument is evaluated more than once, so it must not have side
                                                       * effects.
                                                       *
                                                       * HLIDSKIP and XMLHLIDSKIP currently have identical definitions.
                                                       * NOENDTOKEN includes both NONWORDTOKEN and TS_IDIGNORE, so SPACE and
                                                       * TAG_T are tested twice there; that is harmless.
                                                       */
                               1916                 : #define TS_IDIGNORE(x)  ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
                               1917                 : #define HLIDREPLACE(x)  ( (x)==TAG_T )
                               1918                 : #define HLIDSKIP(x)     ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
                               1919                 : #define XMLHLIDSKIP(x)  ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
                               1920                 : #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
                               1921                 : #define NOENDTOKEN(x)   ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
                               1922                 : 
                               1923                 : /*
                               1924                 :  * Macros useful in headline selection.  These rely on availability of
                               1925                 :  * "HeadlineParsedText *prs" describing some text, and "int shortword"
                               1926                 :  * describing the "short word" length parameter.
                                                       * The index argument j is evaluated more than once, so it must not
                                                       * have side effects.
                               1927                 :  */
                               1928                 : 
                               1929                 : /* Interesting words are non-repeated search terms: item set, repeated not set */
                               1930                 : #define INTERESTINGWORD(j) \
                               1931                 :     (prs->words[j].item && !prs->words[j].repeated)
                               1932                 : 
                               1933                 : /* Don't want to end at a non-word or a short word (len <= shortword), unless interesting */
                               1934                 : #define BADENDPOINT(j) \
                               1935                 :     ((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
                               1936                 :      !INTERESTINGWORD(j))
                               1937                 : 
                               1938                 : typedef struct
                               1939                 : {
                               1940                 :     /*
                                                           * One cover (well, really one fragment) for mark_hl_fragments.
                                                           * The fragment spans word indexes startpos..endpos, both inclusive.
                                                           */
                               1941                 :     int32       startpos;       /* index of the fragment's first word */
                               1942                 :     int32       endpos;         /* index of the fragment's last word (inclusive) */
                               1943                 :     int32       poslen;         /* count of interesting words in the fragment */
                               1944                 :     int32       curlen;         /* count of all words in the fragment */
                               1945                 :     bool        chosen;         /* true if this fragment has been chosen */
                               1946                 :     bool        excluded;       /* true if this fragment has been excluded */
                               1947                 : } CoverPos;
                               1948                 : 
                               1949                 : typedef struct
                               1950                 : {
                               1951                 :     /*
                                                           * Callback data for checkcondition_HL: the range of headline words
                                                           * that the callback is allowed to scan.
                                                           */
                               1952                 :     HeadlineWordEntry *words;   /* first word of the range */
                               1953                 :     int         len;            /* number of words in the range */
                               1954                 : } hlCheck;
                               1955                 : 
                               1956                 : 
                               1957                 : /*
                               1958                 :  * TS_execute callback for matching a tsquery operand to headline words
                               1959                 :  *
                                                       * opaque is an hlCheck giving the word range to scan.  A word matches
                                                       * when its item pointer is exactly val.
                                                       *
                                                       * If data is NULL, returns TS_YES at the first match.  Otherwise the pos
                                                       * values of matching words are collected in data->pos: the array is
                                                       * palloc'd at the first match with room for one entry per scanned word
                                                       * (and data->allocated is set), and a later match is appended only if
                                                       * its pos is greater than the last one stored, so the stored positions
                                                       * are strictly increasing.  Returns TS_YES if data->npos > 0 after the
                                                       * scan, else TS_NO.
                                                       *
                                                       * NOTE(review): this assumes the caller passes data with pos == NULL
                                                       * and npos == 0 -- confirm against TS_execute.
                                                       *
                               1960                 :  * Note: it's tempting to report words[] indexes as pos values to save
                               1961                 :  * searching in hlCover; but that would screw up phrase matching, which
                               1962                 :  * expects to measure distances in lexemes not tokens.
                               1963                 :  */
                               1964                 : static TSTernaryValue
 2558 teodor                   1965 GIC         500 : checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
                               1966                 : {
 2495 rhaas                    1967             500 :     hlCheck    *checkval = (hlCheck *) opaque;
                               1968                 :     int         i;
 5710 tgl                      1969 ECB             : 
                               1970                 :     /* scan words array for matching items */
 2558 teodor                   1971 CBC       12725 :     for (i = 0; i < checkval->len; i++)
                               1972                 :     {
 2558 teodor                   1973 GIC       12325 :         if (checkval->words[i].item == val)
                               1974                 :         {
 1095 tgl                      1975 ECB             :             /* if data == NULL, don't need to report positions */
 2558 teodor                   1976 GIC         437 :             if (!data)
  989 tgl                      1977 CBC         100 :                 return TS_YES;
                               1978                 : 
                                                                  /* first match: allocate the position array, sized for the worst case */
 2558 teodor                   1979 GIC         337 :             if (!data->pos)
 2558 teodor                   1980 ECB             :             {
 2558 teodor                   1981 CBC         238 :                 data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
 2558 teodor                   1982 GIC         238 :                 data->allocated = true;
 2558 teodor                   1983 CBC         238 :                 data->npos = 1;
 2558 teodor                   1984 GIC         238 :                 data->pos[0] = checkval->words[i].pos;
 2558 teodor                   1985 ECB             :             }
                                                                  /* later match: record it only if it advances past the last stored pos */
 2558 teodor                   1986 CBC          99 :             else if (data->pos[data->npos - 1] < checkval->words[i].pos)
 2558 teodor                   1987 ECB             :             {
 2558 teodor                   1988 CBC          99 :                 data->pos[data->npos++] = checkval->words[i].pos;
                               1989                 :             }
 2558 teodor                   1990 ECB             :         }
                               1991                 :     }
                               1992                 : 
                                                          /* reached with data == NULL only when nothing matched */
 2558 teodor                   1993 GIC         400 :     if (data && data->npos > 0)
  989 tgl                      1994             238 :         return TS_YES;
                               1995                 : 
                               1996             162 :     return TS_NO;
 5710 tgl                      1997 ECB             : }
                               1998                 : 
                               1999                 : /*
                               2000                 :  * hlCover: try to find a substring of prs' word list that satisfies query
                               2001                 :  *
                               2002                 :  * locations is the result of TS_execute_locations() for the query.
                               2003                 :  * We use this to identify plausible subranges of the query.
                               2004                 :  *
                               2005                 :  * *nextpos is the lexeme position (NOT word index) to start the search
                               2006                 :  * at.  Caller should initialize this to zero.  If successful, we'll
                               2007                 :  * advance it to the next place to search at.
                               2008                 :  *
                               2009                 :  * On success, sets *p to first word index and *q to last word index of the
                               2010                 :  * cover substring, and returns true.
                               2011                 :  *
                               2012                 :  * The result is a minimal cover, in the sense that both *p and *q will be
                               2013                 :  * words used in the query.
                               2014                 :  */
                               2015                 : static bool
   80 tgl                      2016 GNC         281 : hlCover(HeadlineParsedText *prs, TSQuery query, List *locations,
                               2017                 :         int *nextpos, int *p, int *q)
                               2018                 : {
                               2019             281 :     int         pos = *nextpos;
 5710 tgl                      2020 ECB             : 
                               2021                 :     /* This loop repeats when our selected word-range fails the query */
                               2022                 :     for (;;)
 5710 tgl                      2023 CBC          30 :     {
                               2024                 :         int         posb,
                               2025                 :                     pose;
                               2026                 :         ListCell   *lc;
   80 tgl                      2027 ECB             : 
                               2028                 :         /*
                               2029                 :          * For each AND'ed query term or phrase, find its first occurrence at
                               2030                 :          * or after pos; set pose to the maximum of those positions.
                               2031                 :          *
                               2032                 :          * We need not consider ORs or NOTs here; see the comments for
                               2033                 :          * TS_execute_locations().  Rechecking the match with TS_execute(),
                               2034                 :          * below, will deal with any ensuing imprecision.
                               2035                 :          */
   80 tgl                      2036 GNC         311 :         pose = -1;
                               2037             483 :         foreach(lc, locations)
                               2038                 :         {
                               2039             233 :             ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
                               2040             233 :             int         first = -1;
                               2041                 : 
                               2042             396 :             for (int i = 0; i < pdata->npos; i++)
                               2043                 :             {
                               2044                 :                 /* For phrase matches, use the ending lexeme */
                               2045             335 :                 int         endp = pdata->pos[i];
                               2046                 : 
                               2047             335 :                 if (endp >= pos)
                               2048                 :                 {
                               2049             172 :                     first = endp;
                               2050             172 :                     break;
                               2051                 :                 }
                               2052                 :             }
                               2053             233 :             if (first < 0)
                               2054              61 :                 return false;   /* no more matches for this term */
                               2055             172 :             if (first > pose)
                               2056             163 :                 pose = first;
   80 tgl                      2057 ECB             :         }
                               2058                 : 
   80 tgl                      2059 GNC         250 :         if (pose < 0)
                               2060             123 :             return false;       /* we only get here if empty list */
                               2061                 : 
                               2062                 :         /*
                               2063                 :          * Now, for each AND'ed query term or phrase, find its last occurrence
                               2064                 :          * at or before pose; set posb to the minimum of those positions.
                               2065                 :          *
                               2066                 :          * We start posb at INT_MAX - 1 to guarantee no overflow if we compute
                               2067                 :          * posb + 1 below.
                               2068                 :          */
                               2069             127 :         posb = INT_MAX - 1;
                               2070             293 :         foreach(lc, locations)
                               2071                 :         {
                               2072             166 :             ExecPhraseData *pdata = (ExecPhraseData *) lfirst(lc);
                               2073             166 :             int         last = -1;
                               2074                 : 
                               2075             247 :             for (int i = pdata->npos - 1; i >= 0; i--)
                               2076                 :             {
                               2077                 :                 /* For phrase matches, use the starting lexeme */
                               2078             247 :                 int         startp = pdata->pos[i] - pdata->width;
                               2079                 : 
                               2080             247 :                 if (startp <= pose)
                               2081                 :                 {
                               2082             166 :                     last = startp;
                               2083             166 :                     break;
                               2084                 :                 }
                               2085                 :             }
                               2086             166 :             if (last < posb)
                               2087             136 :                 posb = last;
                               2088                 :         }
                               2089                 : 
                               2090                 :         /*
                               2091                 :          * We could end up with posb to the left of pos, in case some phrase
                               2092                 :          * match crosses pos.  Try the match starting at pos anyway, since the
                               2093                 :          * result of TS_execute_locations is imprecise for phrase matches OR'd
                               2094                 :          * with plain matches; that is, if the query is "(A <-> B) | C" then C
                               2095                 :          * could match at pos even though the phrase match would have to
                               2096                 :          * extend to the left of pos.
                               2097                 :          */
                               2098             127 :         posb = Max(posb, pos);
                               2099                 : 
                               2100                 :         /* This test probably always succeeds, but be paranoid */
                               2101             127 :         if (posb <= pose)
                               2102                 :         {
                               2103                 :             /*
                               2104                 :              * posb .. pose is now the shortest, earliest-after-pos range of
                               2105                 :              * lexeme positions containing all the query terms.  It will
                               2106                 :              * contain all phrase matches, too, except in the corner case
                               2107                 :              * described just above.
                               2108                 :              *
                               2109                 :              * Now convert these lexeme positions to indexes in prs->words[].
                               2110                 :              */
                               2111             127 :             int         idxb = -1;
                               2112             127 :             int         idxe = -1;
                               2113                 : 
                               2114            5812 :             for (int i = 0; i < prs->curwords; i++)
                               2115                 :             {
                               2116            5748 :                 if (prs->words[i].item == NULL)
                               2117            5306 :                     continue;
                               2118             442 :                 if (idxb < 0 && prs->words[i].pos >= posb)
                               2119             127 :                     idxb = i;
                               2120             442 :                 if (prs->words[i].pos <= pose)
                               2121             379 :                     idxe = i;
                               2122                 :                 else
                               2123              63 :                     break;
                               2124                 :             }
                               2125                 : 
                               2126                 :             /* This test probably always succeeds, but be paranoid */
                               2127             127 :             if (idxb >= 0 && idxe >= idxb)
                               2128                 :             {
                               2129                 :                 /*
                               2130                 :                  * Finally, check that the selected range satisfies the query.
                               2131                 :                  * This should succeed in all simple cases; but odd cases
                               2132                 :                  * involving non-top-level NOT conditions or phrase matches
                               2133                 :                  * OR'd with other things could fail, since the result of
                               2134                 :                  * TS_execute_locations doesn't fully represent such things.
                               2135                 :                  */
                               2136                 :                 hlCheck     ch;
                               2137                 : 
                               2138             127 :                 ch.words = &(prs->words[idxb]);
                               2139             127 :                 ch.len = idxe - idxb + 1;
                               2140             127 :                 if (TS_execute(GETQUERY(query), &ch,
                               2141                 :                                TS_EXEC_EMPTY, checkcondition_HL))
                               2142                 :                 {
                               2143                 :                     /* Match!  Advance *nextpos and return the word range. */
                               2144              97 :                     *nextpos = posb + 1;
                               2145              97 :                     *p = idxb;
                               2146              97 :                     *q = idxe;
                               2147              97 :                     return true;
                               2148                 :                 }
                               2149                 :             }
                               2150                 :         }
                               2151                 : 
                               2152                 :         /*
                               2153                 :          * Advance pos and try again.  Any later workable match must start
                               2154                 :          * beyond posb.
                               2155                 :          */
                               2156              30 :         pos = posb + 1;
 5710 tgl                      2157 ECB             :     }
                               2158                 :     /* Can't get here, but stupider compilers complain if we leave it off */
                               2159                 :     return false;
                               2160                 : }
                               2161                 : 
 1095                          2162                 : /*
                               2163                 :  * Apply suitable highlight marking to words selected by headline selector
                               2164                 :  *
                               2165                 :  * The words from startpos to endpos inclusive are marked per highlightall
                               2166                 :  */
                               2167                 : static void
 1095 tgl                      2168 GIC         193 : mark_fragment(HeadlineParsedText *prs, bool highlightall,
                               2169                 :               int startpos, int endpos)
                               2170                 : {
                               2171                 :     int         i;
                               2172                 : 
 5287 teodor                   2173            2827 :     for (i = startpos; i <= endpos; i++)
 5287 teodor                   2174 ECB             :     {
 5287 teodor                   2175 GIC        2634 :         if (prs->words[i].item)
                               2176             250 :             prs->words[i].selected = 1;
 1095 tgl                      2177 CBC        2634 :         if (!highlightall)
                               2178                 :         {
 5197 teodor                   2179 GIC        2511 :             if (HLIDREPLACE(prs->words[i].type))
 5287 teodor                   2180 UIC           0 :                 prs->words[i].replace = 1;
 4382 bruce                    2181 GIC        2511 :             else if (HLIDSKIP(prs->words[i].type))
 5197 teodor                   2182 UIC           0 :                 prs->words[i].skip = 1;
                               2183                 :         }
                               2184                 :         else
                               2185                 :         {
 5197 teodor                   2186 GIC         123 :             if (XMLHLIDSKIP(prs->words[i].type))
 5197 teodor                   2187 CBC           3 :                 prs->words[i].skip = 1;
 5287 teodor                   2188 ECB             :         }
                               2189                 : 
 5287 teodor                   2190 CBC        2634 :         prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
                               2191                 :     }
                               2192             193 : }
 5287 teodor                   2193 ECB             : 
 1095 tgl                      2194                 : /*
                               2195                 :  * split a cover substring into fragments not longer than max_words
                               2196                 :  *
                               2197                 :  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
                               2198                 :  * substring.  They are updated to hold the bounds of the next fragment.
                               2199                 :  *
                               2200                 :  * *curlen and *poslen are set to the fragment's length, in words and
                               2201                 :  * interesting words respectively.
                               2202                 :  */
 4520 peter_e                  2203                 : static void
 5287 teodor                   2204 GIC          18 : get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
                               2205                 :                   int *curlen, int *poslen, int max_words)
                               2206                 : {
                               2207                 :     int         i;
                               2208                 : 
                               2209                 :     /*
                               2210                 :      * Objective: select a fragment of words between startpos and endpos such
                               2211                 :      * that it has at most max_words and both ends have query words. If the
                               2212                 :      * startpos and endpos are the endpoints of the cover and the cover has
                               2213                 :      * fewer words than max_words, then this function should just return the
 1095 tgl                      2214 ECB             :      * cover
 5287 teodor                   2215                 :      */
                               2216                 :     /* first move startpos to an item */
 4382 bruce                    2217 GIC         444 :     for (i = *startpos; i <= *endpos; i++)
                               2218                 :     {
 5287 teodor                   2219             444 :         *startpos = i;
 1095 tgl                      2220 CBC         444 :         if (INTERESTINGWORD(i))
 5287 teodor                   2221              18 :             break;
 5287 teodor                   2222 ECB             :     }
                               2223                 :     /* cut endpos to have only max_words */
 5287 teodor                   2224 GIC          18 :     *curlen = 0;
                               2225              18 :     *poslen = 0;
 4382 bruce                    2226             480 :     for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
                               2227                 :     {
 5287 teodor                   2228             462 :         if (!NONWORDTOKEN(prs->words[i].type))
                               2229             240 :             *curlen += 1;
 1095 tgl                      2230             462 :         if (INTERESTINGWORD(i))
 5287 teodor                   2231              27 :             *poslen += 1;
 5287 teodor                   2232 ECB             :     }
                               2233                 :     /* if the cover was cut then move back endpos to a query item */
 5287 teodor                   2234 GIC          18 :     if (*endpos > i)
                               2235                 :     {
                               2236               6 :         *endpos = i;
 4382 bruce                    2237             420 :         for (i = *endpos; i >= *startpos; i--)
                               2238                 :         {
 5287 teodor                   2239             420 :             *endpos = i;
 1095 tgl                      2240             420 :             if (INTERESTINGWORD(i))
 5287 teodor                   2241               6 :                 break;
                               2242             414 :             if (!NONWORDTOKEN(prs->words[i].type))
                               2243             204 :                 *curlen -= 1;
 4520 peter_e                  2244 ECB             :         }
                               2245                 :     }
 5287 teodor                   2246 GIC          18 : }
                               2247                 : 
                               2248                 : /*
 1095 tgl                      2249 ECB             :  * Headline selector used when MaxFragments > 0
                               2250                 :  *
                               2251                 :  * Note: in this mode, highlightall is disregarded for phrase selection;
                               2252                 :  * it only controls presentation details.
                               2253                 :  */
                               2254                 : static void
   80 tgl                      2255 GNC          15 : mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, List *locations,
                               2256                 :                   bool highlightall,
 4382 bruce                    2257 EUB             :                   int shortword, int min_words,
                               2258                 :                   int max_words, int max_fragments)
 5287 teodor                   2259                 : {
                               2260                 :     int32       poslen,
                               2261                 :                 curlen,
                               2262                 :                 i,
 4382 bruce                    2263 ECB             :                 f,
 4382 bruce                    2264 CBC          15 :                 num_f = 0;
                               2265                 :     int32       stretch,
                               2266                 :                 maxstretch,
 4382 bruce                    2267 ECB             :                 posmarker;
                               2268                 : 
 3940 peter_e                  2269 CBC          15 :     int32       startpos = 0,
 4382 bruce                    2270 GIC          15 :                 endpos = 0,
   80 tgl                      2271 GNC          15 :                 nextpos = 0,
 4382 bruce                    2272 GIC          15 :                 p = 0,
                               2273              15 :                 q = 0;
                               2274                 : 
 3940 peter_e                  2275              15 :     int32       numcovers = 0,
 4382 bruce                    2276              15 :                 maxcovers = 32;
                               2277                 : 
                               2278                 :     int32       minI,
                               2279                 :                 minwords,
                               2280                 :                 maxitems;
                               2281                 :     CoverPos   *covers;
 5287 teodor                   2282 ECB             : 
 5287 teodor                   2283 GIC          15 :     covers = palloc(maxcovers * sizeof(CoverPos));
                               2284                 : 
                               2285                 :     /* get all covers */
   80 tgl                      2286 GNC          27 :     while (hlCover(prs, query, locations, &nextpos, &p, &q))
                               2287                 :     {
 5287 teodor                   2288 GIC          12 :         startpos = p;
 4382 bruce                    2289              12 :         endpos = q;
                               2290                 : 
                               2291                 :         /*
                               2292                 :          * Break the cover into smaller fragments such that each fragment has
                               2293                 :          * at most max_words. Also ensure that each end of each fragment is a
                               2294                 :          * query word. This will allow us to stretch the fragment in either
 4382 bruce                    2295 ECB             :          * direction
                               2296                 :          */
 5287 teodor                   2297                 : 
 5287 teodor                   2298 CBC          30 :         while (startpos <= endpos)
 5287 teodor                   2299 ECB             :         {
 5287 teodor                   2300 GIC          18 :             get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
                               2301              18 :             if (numcovers >= maxcovers)
 5287 teodor                   2302 ECB             :             {
 5287 teodor                   2303 LBC           0 :                 maxcovers *= 2;
 4382 bruce                    2304               0 :                 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
                               2305                 :             }
 5287 teodor                   2306 CBC          18 :             covers[numcovers].startpos = startpos;
 4382 bruce                    2307              18 :             covers[numcovers].endpos = endpos;
                               2308              18 :             covers[numcovers].curlen = curlen;
                               2309              18 :             covers[numcovers].poslen = poslen;
 1095 tgl                      2310 GIC          18 :             covers[numcovers].chosen = false;
                               2311              18 :             covers[numcovers].excluded = false;
 4382 bruce                    2312 CBC          18 :             numcovers++;
 5287 teodor                   2313 GIC          18 :             startpos = endpos + 1;
 4382 bruce                    2314 CBC          18 :             endpos = q;
 4520 peter_e                  2315 ECB             :         }
 5287 teodor                   2316                 :     }
 5710 tgl                      2317                 : 
 5287 teodor                   2318                 :     /* choose best covers */
 5287 teodor                   2319 GIC          33 :     for (f = 0; f < max_fragments; f++)
                               2320                 :     {
 5287 teodor                   2321 CBC          24 :         maxitems = 0;
 2929 andres                   2322 GIC          24 :         minwords = PG_INT32_MAX;
 5287 teodor                   2323              24 :         minI = -1;
                               2324                 : 
                               2325                 :         /*
                               2326                 :          * Choose the cover that contains max items. In case of tie choose the
                               2327                 :          * one with smaller number of words.
                               2328                 :          */
 4382 bruce                    2329              57 :         for (i = 0; i < numcovers; i++)
 5287 teodor                   2330 ECB             :         {
 1095 tgl                      2331 GIC          33 :             if (!covers[i].chosen && !covers[i].excluded &&
                               2332              24 :                 (maxitems < covers[i].poslen ||
                               2333               6 :                  (maxitems == covers[i].poslen &&
                               2334               6 :                   minwords > covers[i].curlen)))
                               2335                 :             {
 5287 teodor                   2336              18 :                 maxitems = covers[i].poslen;
                               2337              18 :                 minwords = covers[i].curlen;
 4382 bruce                    2338              18 :                 minI = i;
 5287 teodor                   2339 ECB             :             }
                               2340                 :         }
                               2341                 :         /* if a cover was found mark it */
 5287 teodor                   2342 GIC          24 :         if (minI >= 0)
                               2343                 :         {
 1095 tgl                      2344 CBC          18 :             covers[minI].chosen = true;
 5287 teodor                   2345 ECB             :             /* adjust the size of cover */
 5287 teodor                   2346 CBC          18 :             startpos = covers[minI].startpos;
 4382 bruce                    2347              18 :             endpos = covers[minI].endpos;
                               2348              18 :             curlen = covers[minI].curlen;
                               2349                 :             /* stretch the cover if cover size is lower than max_words */
 4520 peter_e                  2350              18 :             if (curlen < max_words)
 5287 teodor                   2351 ECB             :             {
                               2352                 :                 /* divide the stretch on both sides of cover */
 4382 bruce                    2353 GIC          18 :                 maxstretch = (max_words - curlen) / 2;
                               2354                 : 
                               2355                 :                 /*
                               2356                 :                  * first stretch the startpos stop stretching if 1. we hit the
                               2357                 :                  * beginning of document 2. exceed maxstretch 3. we hit an
 4382 bruce                    2358 ECB             :                  * already marked fragment
                               2359                 :                  */
 4382 bruce                    2360 GIC          18 :                 stretch = 0;
 5287 teodor                   2361 CBC          18 :                 posmarker = startpos;
 5287 teodor                   2362 GIC         300 :                 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
 5287 teodor                   2363 ECB             :                 {
 5287 teodor                   2364 CBC         282 :                     if (!NONWORDTOKEN(prs->words[i].type))
                               2365                 :                     {
 4382 bruce                    2366 GIC         135 :                         curlen++;
                               2367             135 :                         stretch++;
                               2368                 :                     }
 5287 teodor                   2369             282 :                     posmarker = i;
                               2370                 :                 }
                               2371                 :                 /* cut back startpos till we find a good endpoint */
 1095 tgl                      2372              66 :                 for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
 5287 teodor                   2373 ECB             :                 {
 5287 teodor                   2374 GIC          48 :                     if (!NONWORDTOKEN(prs->words[i].type))
 4382 bruce                    2375 CBC          18 :                         curlen--;
 5287 teodor                   2376 ECB             :                 }
 5287 teodor                   2377 GIC          18 :                 startpos = i;
 4382 bruce                    2378 EUB             :                 /* now stretch the endpos as much as possible */
 5287 teodor                   2379 GBC          18 :                 posmarker = endpos;
 5287 teodor                   2380 GIC         483 :                 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
 5287 teodor                   2381 ECB             :                 {
 5287 teodor                   2382 CBC         465 :                     if (!NONWORDTOKEN(prs->words[i].type))
 4382 bruce                    2383             231 :                         curlen++;
 4520 peter_e                  2384             465 :                     posmarker = i;
 5287 teodor                   2385 ECB             :                 }
 1095 tgl                      2386                 :                 /* cut back endpos till we find a good endpoint */
 1095 tgl                      2387 CBC          45 :                 for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
 5287 teodor                   2388 ECB             :                 {
 5287 teodor                   2389 CBC          27 :                     if (!NONWORDTOKEN(prs->words[i].type))
 4382 bruce                    2390 GIC          12 :                         curlen--;
                               2391                 :                 }
 5287 teodor                   2392              18 :                 endpos = i;
                               2393                 :             }
 5287 teodor                   2394 CBC          18 :             covers[minI].startpos = startpos;
 4382 bruce                    2395 GIC          18 :             covers[minI].endpos = endpos;
 4382 bruce                    2396 CBC          18 :             covers[minI].curlen = curlen;
 5287 teodor                   2397 ECB             :             /* Mark the chosen fragments (covers) */
 1095 tgl                      2398 CBC          18 :             mark_fragment(prs, highlightall, startpos, endpos);
 4382 bruce                    2399 GIC          18 :             num_f++;
                               2400                 :             /* Exclude covers overlapping this one from future consideration */
                               2401              48 :             for (i = 0; i < numcovers; i++)
                               2402                 :             {
 1095 tgl                      2403              30 :                 if (i != minI &&
 1095 tgl                      2404 CBC          12 :                     ((covers[i].startpos >= startpos &&
 1095 tgl                      2405 GIC           6 :                       covers[i].startpos <= endpos) ||
 1095 tgl                      2406 CBC          12 :                      (covers[i].endpos >= startpos &&
                               2407               6 :                       covers[i].endpos <= endpos) ||
                               2408              12 :                      (covers[i].startpos < startpos &&
                               2409               6 :                       covers[i].endpos > endpos)))
 1095 tgl                      2410 UIC           0 :                     covers[i].excluded = true;
 5287 teodor                   2411 ECB             :             }
                               2412                 :         }
                               2413                 :         else
 1095 tgl                      2414 GIC           6 :             break;              /* no selectable covers remain */
                               2415                 :     }
                               2416                 : 
 1095 tgl                      2417 ECB             :     /* show the first min_words words if we have not marked anything */
 5287 teodor                   2418 GIC          15 :     if (num_f <= 0)
 5287 teodor                   2419 ECB             :     {
    3 tgl                      2420 GNC           3 :         startpos = curlen = 0;
                               2421               3 :         endpos = -1;
 5287 teodor                   2422 CBC          93 :         for (i = 0; i < prs->curwords && curlen < min_words; i++)
 5287 teodor                   2423 ECB             :         {
 5287 teodor                   2424 CBC          90 :             if (!NONWORDTOKEN(prs->words[i].type))
 5287 teodor                   2425 GIC          45 :                 curlen++;
 5287 teodor                   2426 CBC          90 :             endpos = i;
                               2427                 :         }
 1095 tgl                      2428 GIC           3 :         mark_fragment(prs, highlightall, startpos, endpos);
 5287 teodor                   2429 ECB             :     }
                               2430                 : 
 5287 teodor                   2431 GIC          15 :     pfree(covers);
                               2432              15 : }
                               2433                 : 
                               2434                 : /*
                               2435                 :  * Headline selector used when MaxFragments == 0
 1095 tgl                      2436 ECB             :  */
 5287 teodor                   2437                 : static void
   80 tgl                      2438 GNC         172 : mark_hl_words(HeadlineParsedText *prs, TSQuery query, List *locations,
                               2439                 :               bool highlightall,
                               2440                 :               int shortword, int min_words, int max_words)
 5287 teodor                   2441 ECB             : {
   80 tgl                      2442 GNC         172 :     int         nextpos = 0,
                               2443             172 :                 p = 0,
 5710 tgl                      2444 CBC         172 :                 q = 0;
                               2445             172 :     int         bestb = -1,
 5710 tgl                      2446 GIC         172 :                 beste = -1;
 5710 tgl                      2447 CBC         172 :     int         bestlen = -1;
 1095 tgl                      2448 GIC         172 :     bool        bestcover = false;
                               2449                 :     int         pose,
 5710 tgl                      2450 ECB             :                 posb,
                               2451                 :                 poslen,
                               2452                 :                 curlen;
 1095                          2453                 :     bool        poscover;
                               2454                 :     int         i;
 5710                          2455                 : 
 1095 tgl                      2456 GIC         172 :     if (!highlightall)
 5710 tgl                      2457 ECB             :     {
 1095                          2458                 :         /* examine all covers, select a headline using the best one */
   80 tgl                      2459 GNC         254 :         while (hlCover(prs, query, locations, &nextpos, &p, &q))
 5710 tgl                      2460 ECB             :         {
 1095                          2461                 :             /*
                               2462                 :              * Count words (curlen) and interesting words (poslen) within
                               2463                 :              * cover, but stop once we reach max_words.  This step doesn't
                               2464                 :              * consider whether that's a good stopping point.  posb and pose
                               2465                 :              * are set to the start and end indexes of the possible headline.
                               2466                 :              */
 5710 tgl                      2467 CBC          85 :             curlen = 0;
                               2468              85 :             poslen = 0;
 1095 tgl                      2469 GIC          85 :             posb = pose = p;
 5710 tgl                      2470 CBC         728 :             for (i = p; i <= q && curlen < max_words; i++)
                               2471                 :             {
                               2472             643 :                 if (!NONWORDTOKEN(prs->words[i].type))
                               2473             364 :                     curlen++;
 1095                          2474             643 :                 if (INTERESTINGWORD(i))
 5710 tgl                      2475 GIC         145 :                     poslen++;
 5710 tgl                      2476 CBC         643 :                 pose = i;
 5710 tgl                      2477 ECB             :             }
                               2478                 : 
 5710 tgl                      2479 CBC          85 :             if (curlen < max_words)
                               2480                 :             {
 1095 tgl                      2481 ECB             :                 /*
                               2482                 :                  * We have room to lengthen the headline, so search forward
                               2483                 :                  * until it's full or we find a good stopping point.  We'll
                               2484                 :                  * reconsider the word at "q", then move forward.
                               2485                 :                  */
 5710 tgl                      2486 CBC        1469 :                 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
 5710 tgl                      2487 ECB             :                 {
 1095 tgl                      2488 GBC        1456 :                     if (i > q)
                               2489                 :                     {
 5710 tgl                      2490 GIC        1377 :                         if (!NONWORDTOKEN(prs->words[i].type))
                               2491             687 :                             curlen++;
 1095 tgl                      2492 CBC        1377 :                         if (INTERESTINGWORD(i))
 5710 tgl                      2493 GIC          60 :                             poslen++;
                               2494                 :                     }
                               2495            1456 :                     pose = i;
 1095 tgl                      2496 CBC        1456 :                     if (BADENDPOINT(i))
 5710 tgl                      2497 GIC         972 :                         continue;
 5710 tgl                      2498 CBC         484 :                     if (curlen >= min_words)
                               2499              66 :                         break;
 5710 tgl                      2500 ECB             :                 }
 1095 tgl                      2501 GIC          79 :                 if (curlen < min_words)
 1095 tgl                      2502 ECB             :                 {
                               2503                 :                     /*
                               2504                 :                      * Reached end of text and our headline is still shorter
                               2505                 :                      * than min_words, so try to extend it to the left.
                               2506                 :                      */
 5197 teodor                   2507 GIC         183 :                     for (i = p - 1; i >= 0; i--)
                               2508                 :                     {
 5710 tgl                      2509 CBC         182 :                         if (!NONWORDTOKEN(prs->words[i].type))
                               2510              91 :                             curlen++;
 1095 tgl                      2511 GIC         182 :                         if (INTERESTINGWORD(i))
 5710                          2512               3 :                             poslen++;
 4382 bruce                    2513             182 :                         if (curlen >= max_words)
 5197 teodor                   2514 UIC           0 :                             break;
 1095 tgl                      2515 GIC         182 :                         if (BADENDPOINT(i))
 5710 tgl                      2516 CBC         118 :                             continue;
 5710 tgl                      2517 GIC          64 :                         if (curlen >= min_words)
                               2518              12 :                             break;
                               2519                 :                     }
 5710 tgl                      2520 CBC          13 :                     posb = (i >= 0) ? i : 0;
 5710 tgl                      2521 ECB             :                 }
                               2522                 :             }
                               2523                 :             else
 1095                          2524                 :             {
                               2525                 :                 /*
                               2526                 :                  * Can't make headline longer, so consider making it shorter
                               2527                 :                  * if needed to avoid a bad endpoint.
                               2528                 :                  */
 2557 teodor                   2529 GIC           6 :                 if (i > q)
                               2530               3 :                     i = q;
 5710 tgl                      2531              15 :                 for (; curlen > min_words; i--)
                               2532                 :                 {
 1095                          2533              15 :                     if (!BADENDPOINT(i))
 1095 tgl                      2534 ECB             :                         break;
 5710 tgl                      2535 GIC           9 :                     if (!NONWORDTOKEN(prs->words[i].type))
                               2536               3 :                         curlen--;
 1095 tgl                      2537 CBC           9 :                     if (INTERESTINGWORD(i))
 5710 tgl                      2538 UIC           0 :                         poslen--;
 1095 tgl                      2539 GIC           9 :                     pose = i - 1;
                               2540                 :                 }
                               2541                 :             }
                               2542                 : 
                               2543                 :             /*
                               2544                 :              * Check whether the proposed headline includes the original
 1095 tgl                      2545 ECB             :              * cover; it might not if we trimmed it due to max_words.
                               2546                 :              */
 1095 tgl                      2547 CBC          85 :             poscover = (posb <= p && pose >= q);
 1095 tgl                      2548 ECB             : 
                               2549                 :             /*
                               2550                 :              * Adopt this headline if it's better than the last one, giving
                               2551                 :              * highest priority to headlines including the cover, then to
                               2552                 :              * headlines with more interesting words, then to headlines with
                               2553                 :              * good stopping points.  (Since bestlen is initially -1, we will
                               2554                 :              * certainly adopt the first headline.)
                               2555                 :              */
 1095 tgl                      2556 GIC          85 :             if (poscover > bestcover ||
 1095 tgl                      2557 CBC          39 :                 (poscover == bestcover && poslen > bestlen) ||
 1095 tgl                      2558 GIC          36 :                 (poscover == bestcover && poslen == bestlen &&
                               2559               6 :                  !BADENDPOINT(pose) && BADENDPOINT(beste)))
                               2560                 :             {
 5710                          2561              49 :                 bestb = posb;
                               2562              49 :                 beste = pose;
                               2563              49 :                 bestlen = poslen;
 1095 tgl                      2564 CBC          49 :                 bestcover = poscover;
                               2565                 :             }
 5710 tgl                      2566 ECB             :         }
                               2567                 : 
 1095                          2568                 :         /*
                               2569                 :          * If we found nothing acceptable, select min_words words starting at
                               2570                 :          * the beginning.
                               2571                 :          */
 5710 tgl                      2572 CBC         169 :         if (bestlen < 0)
 5710 tgl                      2573 ECB             :         {
 5710 tgl                      2574 CBC         120 :             curlen = 0;
    3 tgl                      2575 GNC         120 :             pose = -1;
 5710 tgl                      2576 CBC         519 :             for (i = 0; i < prs->curwords && curlen < min_words; i++)
                               2577                 :             {
 5710 tgl                      2578 GIC         399 :                 if (!NONWORDTOKEN(prs->words[i].type))
                               2579             258 :                     curlen++;
                               2580             399 :                 pose = i;
                               2581                 :             }
 5710 tgl                      2582 CBC         120 :             bestb = 0;
 5710 tgl                      2583 GIC         120 :             beste = pose;
 5710 tgl                      2584 ECB             :         }
                               2585                 :     }
                               2586                 :     else
                               2587                 :     {
 1095                          2588                 :         /* highlightall mode: headline is whole document */
 5710 tgl                      2589 GBC           3 :         bestb = 0;
 5710 tgl                      2590 CBC           3 :         beste = prs->curwords - 1;
 5710 tgl                      2591 ECB             :     }
                               2592                 : 
 1095 tgl                      2593 CBC         172 :     mark_fragment(prs, highlightall, bestb, beste);
 5287 teodor                   2594 GIC         172 : }
 5287 teodor                   2595 ECB             : 
                               2596                 : /*
                               2597                 :  * Default parser's prsheadline function
                               2598                 :  */
                               2599                 : Datum
 5287 teodor                   2600 GIC         187 : prsd_headline(PG_FUNCTION_ARGS)
                               2601                 : {
                               2602             187 :     HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
                               2603             187 :     List       *prsoptions = (List *) PG_GETARG_POINTER(1);
 5287 teodor                   2604 CBC         187 :     TSQuery     query = PG_GETARG_TSQUERY(2);
                               2605                 :     List       *locations;
 5287 teodor                   2606 ECB             : 
 1095 tgl                      2607                 :     /* default option values: */
 4382 bruce                    2608 GIC         187 :     int         min_words = 15;
 4382 bruce                    2609 CBC         187 :     int         max_words = 35;
 4382 bruce                    2610 GIC         187 :     int         shortword = 3;
 5287 teodor                   2611 CBC         187 :     int         max_fragments = 0;
 1095 tgl                      2612             187 :     bool        highlightall = false;
 5287 teodor                   2613 EUB             :     ListCell   *l;
 5287 teodor                   2614 ECB             : 
                               2615                 :     /* Extract configuration option values */
 5287 teodor                   2616 GIC         187 :     prs->startsel = NULL;
                               2617             187 :     prs->stopsel = NULL;
 1095 tgl                      2618             187 :     prs->fragdelim = NULL;
 5287 teodor                   2619             364 :     foreach(l, prsoptions)
                               2620                 :     {
                               2621             177 :         DefElem    *defel = (DefElem *) lfirst(l);
 5287 teodor                   2622 CBC         177 :         char       *val = defGetString(defel);
                               2623                 : 
 5287 teodor                   2624 GIC         177 :         if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
 1722 andres                   2625              18 :             max_words = pg_strtoint32(val);
 5287 teodor                   2626             159 :         else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
 1722 andres                   2627              18 :             min_words = pg_strtoint32(val);
 5287 teodor                   2628             141 :         else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
 1722 andres                   2629 UIC           0 :             shortword = pg_strtoint32(val);
 5287 teodor                   2630 GIC         141 :         else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
 1722 andres                   2631 CBC          15 :             max_fragments = pg_strtoint32(val);
 5287 teodor                   2632             126 :         else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
                               2633              60 :             prs->startsel = pstrdup(val);
                               2634              66 :         else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
 5287 teodor                   2635 GIC          60 :             prs->stopsel = pstrdup(val);
 5287 teodor                   2636 CBC           6 :         else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
                               2637               3 :             prs->fragdelim = pstrdup(val);
                               2638               3 :         else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
 1095 tgl                      2639               9 :             highlightall = (pg_strcasecmp(val, "1") == 0 ||
 1095 tgl                      2640 GIC           6 :                             pg_strcasecmp(val, "on") == 0 ||
                               2641               3 :                             pg_strcasecmp(val, "true") == 0 ||
 1095 tgl                      2642 UIC           0 :                             pg_strcasecmp(val, "t") == 0 ||
 1095 tgl                      2643 GIC           6 :                             pg_strcasecmp(val, "y") == 0 ||
 1095 tgl                      2644 UIC           0 :                             pg_strcasecmp(val, "yes") == 0);
                               2645                 :         else
 5287 teodor                   2646               0 :             ereport(ERROR,
 5287 teodor                   2647 ECB             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                               2648                 :                      errmsg("unrecognized headline parameter: \"%s\"",
                               2649                 :                             defel->defname)));
                               2650                 :     }
                               2651                 : 
                               2652                 :     /* in HighlightAll mode these parameters are ignored */
 1095 tgl                      2653 GIC         187 :     if (!highlightall)
                               2654                 :     {
 5287 teodor                   2655 CBC         184 :         if (min_words >= max_words)
 5287 teodor                   2656 LBC           0 :             ereport(ERROR,
                               2657                 :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                               2658                 :                      errmsg("MinWords should be less than MaxWords")));
 5287 teodor                   2659 CBC         184 :         if (min_words <= 0)
 5287 teodor                   2660 LBC           0 :             ereport(ERROR,
                               2661                 :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                               2662                 :                      errmsg("MinWords should be positive")));
 5287 teodor                   2663 GIC         184 :         if (shortword < 0)
 5287 teodor                   2664 UIC           0 :             ereport(ERROR,
                               2665                 :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 5287 teodor                   2666 ECB             :                      errmsg("ShortWord should be >= 0")));
 5287 teodor                   2667 GIC         184 :         if (max_fragments < 0)
 5287 teodor                   2668 LBC           0 :             ereport(ERROR,
 5287 teodor                   2669 ECB             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                               2670                 :                      errmsg("MaxFragments should be >= 0")));
                               2671                 :     }
                               2672                 : 
                               2673                 :     /* Locate words and phrases matching the query */
    3 tgl                      2674 GNC         187 :     if (query->size > 0)
                               2675                 :     {
                               2676                 :         hlCheck     ch;
                               2677                 : 
                               2678             181 :         ch.words = prs->words;
                               2679             181 :         ch.len = prs->curwords;
                               2680             181 :         locations = TS_execute_locations(GETQUERY(query), &ch, TS_EXEC_EMPTY,
                               2681                 :                                          checkcondition_HL);
                               2682                 :     }
                               2683                 :     else
                               2684               6 :         locations = NIL;        /* empty query matches nothing */
                               2685                 : 
                               2686                 :     /* Apply appropriate headline selector */
 5287 teodor                   2687 CBC         187 :     if (max_fragments == 0)
   80 tgl                      2688 GNC         172 :         mark_hl_words(prs, query, locations, highlightall, shortword,
                               2689                 :                       min_words, max_words);
 5287 teodor                   2690 ECB             :     else
   80 tgl                      2691 GNC          15 :         mark_hl_fragments(prs, query, locations, highlightall, shortword,
                               2692                 :                           min_words, max_words, max_fragments);
                               2693                 : 
                               2694                 :     /* Fill in default values for string options */
 5710 tgl                      2695 CBC         187 :     if (!prs->startsel)
                               2696             127 :         prs->startsel = pstrdup("<b>");
                               2697             187 :     if (!prs->stopsel)
                               2698             127 :         prs->stopsel = pstrdup("</b>");
 5287 teodor                   2699 GIC         187 :     if (!prs->fragdelim)
 5287 teodor                   2700 CBC         184 :         prs->fragdelim = pstrdup(" ... ");
 1095 tgl                      2701 ECB             : 
                               2702                 :     /* Caller will need these lengths, too */
 5710 tgl                      2703 CBC         187 :     prs->startsellen = strlen(prs->startsel);
                               2704             187 :     prs->stopsellen = strlen(prs->stopsel);
 5287 teodor                   2705             187 :     prs->fragdelimlen = strlen(prs->fragdelim);
 5710 tgl                      2706 ECB             : 
 5710 tgl                      2707 CBC         187 :     PG_RETURN_POINTER(prs);
 5710 tgl                      2708 EUB             : }
        

Generated by: LCOV version v1.16-55-g56c0a2a