LCOV - differential code coverage report
Current view: top level - src/backend/utils/adt - tsvector_parser.c (source / functions) Coverage Total Hit UNC LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 87.6 % 145 127 2 7 7 2 7 63 4 53 9 65 2
Current Date: 2023-04-08 17:13:01 Functions: 100.0 % 5 5 4 1 5
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (60,120] days: 66.7 % 6 4 2 4
Legend: Lines: hit not hit (240..) days: 88.5 % 139 123 7 7 2 7 63 53 6 57
Function coverage date bins:
(60,120] days: 100.0 % 1 1 1
(240..) days: 44.4 % 9 4 4 5

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * tsvector_parser.c
                                  4                 :  *    Parser for tsvector
                                  5                 :  *
                                  6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                  7                 :  *
                                  8                 :  *
                                  9                 :  * IDENTIFICATION
                                 10                 :  *    src/backend/utils/adt/tsvector_parser.c
                                 11                 :  *
                                 12                 :  *-------------------------------------------------------------------------
                                 13                 :  */
                                 14                 : 
                                 15                 : #include "postgres.h"
                                 16                 : 
                                 17                 : #include "tsearch/ts_locale.h"
                                 18                 : #include "tsearch/ts_utils.h"
                                 19                 : 
                                 20                 : 
                                 21                 : /*
                                 22                 :  * Private state of tsvector parser.  Note that tsquery also uses this code to
                                 23                 :  * parse its input, hence the boolean flags.  The oprisdelim and is_tsquery
                                 24                 :  * flags are both true or both false in current usage, but we keep them
                                 25                 :  * separate for clarity.
                                 26                 :  *
                                 27                 :  * If oprisdelim is set, the following characters are treated as delimiters
                                 28                 :  * (in addition to whitespace): ! | & ( )
                                 29                 :  *
                                 30                 :  * is_tsquery affects *only* the content of error messages.
                                 31                 :  *
                                 32                 :  * is_web can be true to further modify tsquery parsing.
                                 33                 :  *
                                 34                 :  * If escontext is an ErrorSaveContext node, then soft errors can be
                                 35                 :  * captured there rather than being thrown.
                                 36                 :  */
                                 37                 : struct TSVectorParseStateData
                                 38                 : {
                                 39                 :     char       *prsbuf;         /* next input character */
                                 40                 :     char       *bufstart;       /* whole string (used only for errors) */
                                 41                 :     char       *word;           /* buffer to hold the current word */
                                 42                 :     int         len;            /* size in bytes allocated for 'word' */
                                 43                 :     int         eml;            /* max bytes per character */
                                 44                 :     bool        oprisdelim;     /* treat ! | & ( ) as delimiters? */
                                 45                 :     bool        is_tsquery;     /* say "tsquery" not "tsvector" in errors? */
                                 46                 :     bool        is_web;         /* we're in websearch_to_tsquery() */
                                 47                 :     Node       *escontext;      /* for soft error reporting */
                                 48                 : };
                                 49                 : 
                                 50                 : 
                                 51                 : /*
                                 52                 :  * Initializes a parser state object for the given input string.
                                 53                 :  * A bitmask of flags (see ts_utils.h) and an error context object
                                 54                 :  * can be provided as well.
                                 55                 :  */
                                 56                 : TSVectorParseState
  103 tgl                        57 GNC        3816 : init_tsvector_parser(char *input, int flags, Node *escontext)
                                 58                 : {
                                 59                 :     TSVectorParseState state;
                                 60                 : 
 5693 teodor                     61 GIC        3816 :     state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
                                 62            3816 :     state->prsbuf = input;
 5649 tgl                        63            3816 :     state->bufstart = input;
 5693 teodor                     64            3816 :     state->len = 32;
                                 65            3816 :     state->word = (char *) palloc(state->len);
 5649 tgl                        66            3816 :     state->eml = pg_database_encoding_max_length();
 1830 teodor                     67            3816 :     state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
 1830 teodor                     68 CBC        3816 :     state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
 1830 teodor                     69 GIC        3816 :     state->is_web = (flags & P_TSV_IS_WEB) != 0;
  103 tgl                        70 GNC        3816 :     state->escontext = escontext;
                                 71                 : 
 5693 teodor                     72 GIC        3816 :     return state;
 5693 teodor                     73 ECB             : }
                                 74                 : 
                                 75                 : /*
 5649 tgl                        76                 :  * Reinitializes parser to parse 'input', instead of previous input.
                                 77                 :  *
                                 78                 :  * Note that bufstart (the string reported in errors) is not changed.
 5693 teodor                     79                 :  */
                                 80                 : void
 5693 teodor                     81 CBC        4065 : reset_tsvector_parser(TSVectorParseState state, char *input)
 5693 teodor                     82 ECB             : {
 5624 bruce                      83 CBC        4065 :     state->prsbuf = input;
 5693 teodor                     84            4065 : }
                                 85                 : 
 5693 teodor                     86 ECB             : /*
                                 87                 :  * Shuts down a tsvector parser.
                                 88                 :  */
                                 89                 : void
 5693 teodor                     90 GIC        3813 : close_tsvector_parser(TSVectorParseState state)
                                 91                 : {
                                 92            3813 :     pfree(state->word);
                                 93            3813 :     pfree(state);
                                 94            3813 : }
 5693 teodor                     95 ECB             : 
                                 96                 : /* increase the size of 'word' if needed to hold one more character */
                                 97                 : #define RESIZEPRSBUF \
                                 98                 : do { \
                                 99                 :     int clen = curpos - state->word; \
                                100                 :     if ( clen + state->eml >= state->len ) \
                                101                 :     { \
                                102                 :         state->len *= 2; \
                                103                 :         state->word = (char *) repalloc(state->word, state->len); \
                                104                 :         curpos = state->word + clen; \
                                105                 :     } \
                                106                 : } while (0)
                                107                 : 
 5649 tgl                       108                 : /* Fills gettoken_tsvector's output parameters, and returns true */
                                109                 : #define RETURN_TOKEN \
                                110                 : do { \
                                111                 :     if (pos_ptr != NULL) \
                                112                 :     { \
                                113                 :         *pos_ptr = pos; \
                                114                 :         *poslen = npos; \
                                115                 :     } \
                                116                 :     else if (pos != NULL) \
                                117                 :         pfree(pos); \
                                118                 :     \
                                119                 :     if (strval != NULL) \
                                120                 :         *strval = state->word; \
                                121                 :     if (lenval != NULL) \
                                122                 :         *lenval = curpos - state->word; \
                                123                 :     if (endptr != NULL) \
                                124                 :         *endptr = state->prsbuf; \
                                125                 :     return true; \
                                126                 : } while(0)
                                127                 : 
                                128                 : 
                                129                 : /* State codes used in gettoken_tsvector */
                                130                 : #define WAITWORD        1
                                131                 : #define WAITENDWORD     2
                                132                 : #define WAITNEXTCHAR    3
                                133                 : #define WAITENDCMPLX    4
                                134                 : #define WAITPOSINFO     5
                                135                 : #define INPOSINFO       6
                                136                 : #define WAITPOSDELIM    7
                                137                 : #define WAITCHARCMPLX   8
                                138                 : 
                                139                 : #define PRSSYNTAXERROR return prssyntaxerror(state)
                                140                 : 
                                141                 : static bool
 5649 tgl                       142 GIC           9 : prssyntaxerror(TSVectorParseState state)
                                143                 : {
  103 tgl                       144 GNC           9 :     errsave(state->escontext,
                                145                 :             (errcode(ERRCODE_SYNTAX_ERROR),
                                146                 :              state->is_tsquery ?
                                147                 :              errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
                                148                 :              errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
                                149                 :     /* In soft error situation, return false as convenience for caller */
                                150               6 :     return false;
                                151                 : }
                                152                 : 
                                153                 : 
                                154                 : /*
                                155                 :  * Get next token from string being parsed. Returns true if successful,
                                156                 :  * false if end of input string is reached or soft error.
                                157                 :  *
                                158                 :  * On success, these output parameters are filled in:
 5624 bruce                     159 ECB             :  *
                                160                 :  * *strval      pointer to token
                                161                 :  * *lenval      length of *strval
                                162                 :  * *pos_ptr     pointer to a palloc'd array of positions and weights
                                163                 :  *              associated with the token. If the caller is not interested
                                164                 :  *              in the information, NULL can be supplied. Otherwise
                                165                 :  *              the caller is responsible for pfreeing the array.
                                166                 :  * *poslen      number of elements in *pos_ptr
 5649 tgl                       167                 :  * *endptr      scan resumption point
                                168                 :  *
                                169                 :  * Pass NULL for any unwanted output parameters.
                                170                 :  *
                                171                 :  * If state->escontext is an ErrorSaveContext, then caller must check
                                172                 :  * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
                                173                 :  * error or normal end-of-string.
                                174                 :  */
                                175                 : bool
 5624 bruce                     176 GIC       96365 : gettoken_tsvector(TSVectorParseState state,
                                177                 :                   char **strval, int *lenval,
                                178                 :                   WordEntryPos **pos_ptr, int *poslen,
                                179                 :                   char **endptr)
                                180                 : {
                                181           96365 :     int         oldstate = 0;
                                182           96365 :     char       *curpos = state->word;
                                183           96365 :     int         statecode = WAITWORD;
                                184                 : 
                                 185                 :     /*
                                 186                 :      * pos collects the comma-delimited list of positions (and weights)
                                 187                 :      * that follows the token itself, i.e. the part after ':'.
                                 188                 :      */
 5693 teodor                    189           96365 :     WordEntryPos *pos = NULL;
 5624 bruce                     190           96365 :     int         npos = 0;       /* elements of pos used */
                                191           96365 :     int         posalen = 0;    /* allocated size of pos */
                                192                 : 
                                193                 :     while (1)
                                194                 :     {
 5693 teodor                    195          392574 :         if (statecode == WAITWORD)
                                196                 :         {
 5693 teodor                    197 CBC      185055 :             if (*(state->prsbuf) == '\0')
 5693 teodor                    198 GIC        1898 :                 return false;
 1830                           199          183157 :             else if (!state->is_web && t_iseq(state->prsbuf, '\''))
 5693                           200              81 :                 statecode = WAITENDCMPLX;
 1830                           201          183076 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
 5693 teodor                    202 ECB             :             {
 5693 teodor                    203 CBC           3 :                 statecode = WAITNEXTCHAR;
                                204               3 :                 oldstate = WAITENDWORD;
                                205                 :             }
 1830 teodor                    206 GIC      183073 :             else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
                                207          183073 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
 5649 tgl                       208 UIC           0 :                 PRSSYNTAXERROR;
 5693 teodor                    209 GIC      183073 :             else if (!t_isspace(state->prsbuf))
 5693 teodor                    210 ECB             :             {
 5693 teodor                    211 CBC       94383 :                 COPYCHAR(curpos, state->prsbuf);
                                212           94383 :                 curpos += pg_mblen(state->prsbuf);
 5693 teodor                    213 GIC       94383 :                 statecode = WAITENDWORD;
                                214                 :             }
                                215                 :         }
 5693 teodor                    216 CBC      207519 :         else if (statecode == WAITNEXTCHAR)
                                217                 :         {
                                218              81 :             if (*(state->prsbuf) == '\0')
  103 tgl                       219 UNC           0 :                 ereturn(state->escontext, false,
 5693 teodor                    220 ECB             :                         (errcode(ERRCODE_SYNTAX_ERROR),
 5649 tgl                       221                 :                          errmsg("there is no escaped character: \"%s\"",
                                222                 :                                 state->bufstart)));
                                223                 :             else
 5693 teodor                    224                 :             {
 5693 teodor                    225 CBC          81 :                 RESIZEPRSBUF;
 5693 teodor                    226 GIC          81 :                 COPYCHAR(curpos, state->prsbuf);
 5693 teodor                    227 CBC          81 :                 curpos += pg_mblen(state->prsbuf);
                                228              81 :                 Assert(oldstate != 0);
 5693 teodor                    229 GBC          81 :                 statecode = oldstate;
 5693 teodor                    230 ECB             :             }
                                231                 :         }
 5693 teodor                    232 CBC      207438 :         else if (statecode == WAITENDWORD)
 5693 teodor                    233 ECB             :         {
 1830 teodor                    234 CBC      191625 :             if (!state->is_web && t_iseq(state->prsbuf, '\\'))
                                235                 :             {
 5693 teodor                    236 GIC          36 :                 statecode = WAITNEXTCHAR;
 5693 teodor                    237 CBC          36 :                 oldstate = WAITENDWORD;
                                238                 :             }
                                239          191589 :             else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
 1830 teodor                    240 GBC      103404 :                      (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
 1830 teodor                    241 GIC      102516 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
                                242                 :             {
 5693                           243           89079 :                 RESIZEPRSBUF;
                                244           89079 :                 if (curpos == state->word)
 5649 tgl                       245 UIC           0 :                     PRSSYNTAXERROR;
 5693 teodor                    246 CBC       89079 :                 *(curpos) = '\0';
                                247           89079 :                 RETURN_TOKEN;
 5693 teodor                    248 ECB             :             }
 5693 teodor                    249 CBC      102510 :             else if (t_iseq(state->prsbuf, ':'))
 5693 teodor                    250 ECB             :             {
 5693 teodor                    251 GIC        5307 :                 if (curpos == state->word)
 5649 tgl                       252 UIC           0 :                     PRSSYNTAXERROR;
 5693 teodor                    253 CBC        5307 :                 *(curpos) = '\0';
 5693 teodor                    254 GIC        5307 :                 if (state->oprisdelim)
 5693 teodor                    255 CBC         348 :                     RETURN_TOKEN;
                                256                 :                 else
                                257            4959 :                     statecode = INPOSINFO;
 5693 teodor                    258 ECB             :             }
                                259                 :             else
                                260                 :             {
 5693 teodor                    261 CBC       97203 :                 RESIZEPRSBUF;
                                262           97203 :                 COPYCHAR(curpos, state->prsbuf);
 5693 teodor                    263 GIC       97203 :                 curpos += pg_mblen(state->prsbuf);
 5693 teodor                    264 ECB             :             }
                                265                 :         }
 5693 teodor                    266 GBC       15813 :         else if (statecode == WAITENDCMPLX)
 5693 teodor                    267 ECB             :         {
 1830 teodor                    268 CBC         462 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
                                269                 :             {
 5693                           270              81 :                 statecode = WAITCHARCMPLX;
                                271                 :             }
 1830                           272             381 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
 5693 teodor                    273 EUB             :             {
 5693 teodor                    274 CBC          42 :                 statecode = WAITNEXTCHAR;
                                275              42 :                 oldstate = WAITENDCMPLX;
 5693 teodor                    276 ECB             :             }
 5693 teodor                    277 GIC         339 :             else if (*(state->prsbuf) == '\0')
 5649 tgl                       278 LBC           0 :                 PRSSYNTAXERROR;
                                279                 :             else
                                280                 :             {
 5693 teodor                    281 GIC         339 :                 RESIZEPRSBUF;
 5693 teodor                    282 CBC         339 :                 COPYCHAR(curpos, state->prsbuf);
                                283             339 :                 curpos += pg_mblen(state->prsbuf);
 5693 teodor                    284 ECB             :             }
                                285                 :         }
 5693 teodor                    286 GIC       15351 :         else if (statecode == WAITCHARCMPLX)
 5693 teodor                    287 ECB             :         {
 1830 teodor                    288 GIC          81 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
 5693 teodor                    289 ECB             :             {
 5693 teodor                    290 UIC           0 :                 RESIZEPRSBUF;
 5693 teodor                    291 LBC           0 :                 COPYCHAR(curpos, state->prsbuf);
 5693 teodor                    292 UIC           0 :                 curpos += pg_mblen(state->prsbuf);
 5693 teodor                    293 LBC           0 :                 statecode = WAITENDCMPLX;
                                294                 :             }
 5693 teodor                    295 ECB             :             else
                                296                 :             {
 5693 teodor                    297 GIC          81 :                 RESIZEPRSBUF;
 5693 teodor                    298 CBC          81 :                 *(curpos) = '\0';
 5693 teodor                    299 GBC          81 :                 if (curpos == state->word)
 5649 tgl                       300 GIC           9 :                     PRSSYNTAXERROR;
 5693 teodor                    301              72 :                 if (state->oprisdelim)
 5693 teodor                    302 ECB             :                 {
                                303                 :                     /* state->prsbuf+=pg_mblen(state->prsbuf); */
 5693 teodor                    304 CBC          33 :                     RETURN_TOKEN;
                                305                 :                 }
                                306                 :                 else
                                307              39 :                     statecode = WAITPOSINFO;
 5693 teodor                    308 GIC          39 :                 continue;       /* recheck current character */
 5693 teodor                    309 ECB             :             }
                                310                 :         }
 5693 teodor                    311 GBC       15270 :         else if (statecode == WAITPOSINFO)
 5693 teodor                    312 EUB             :         {
 5693 teodor                    313 GBC          39 :             if (t_iseq(state->prsbuf, ':'))
 5693 teodor                    314 UBC           0 :                 statecode = INPOSINFO;
                                315                 :             else
 5693 teodor                    316 GIC          39 :                 RETURN_TOKEN;
                                317                 :         }
 5693 teodor                    318 CBC       15231 :         else if (statecode == INPOSINFO)
 5693 teodor                    319 ECB             :         {
 5693 teodor                    320 CBC        5262 :             if (t_isdigit(state->prsbuf))
 5693 teodor                    321 ECB             :             {
 5693 teodor                    322 CBC        5262 :                 if (posalen == 0)
                                323                 :                 {
 5693 teodor                    324 GIC        4959 :                     posalen = 4;
 5693 teodor                    325 CBC        4959 :                     pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
 5693 teodor                    326 GIC        4959 :                     npos = 0;
                                327                 :                 }
 5693 teodor                    328 CBC         303 :                 else if (npos + 1 >= posalen)
 5693 teodor                    329 ECB             :                 {
 5693 teodor                    330 GIC          57 :                     posalen *= 2;
                                331              57 :                     pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
 5693 teodor                    332 ECB             :                 }
 5693 teodor                    333 GIC        5262 :                 npos++;
 5693 teodor                    334 CBC        5262 :                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
 5649 tgl                       335 EUB             :                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
 5693 teodor                    336 GIC        5262 :                 if (WEP_GETPOS(pos[npos - 1]) == 0)
  103 tgl                       337 UNC           0 :                     ereturn(state->escontext, false,
                                338                 :                             (errcode(ERRCODE_SYNTAX_ERROR),
 5649 tgl                       339 ECB             :                              errmsg("wrong position info in tsvector: \"%s\"",
                                340                 :                                     state->bufstart)));
 5693 teodor                    341 CBC        5262 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
 5693 teodor                    342 GIC        5262 :                 statecode = WAITPOSDELIM;
 5693 teodor                    343 ECB             :             }
                                344                 :             else
 5649 tgl                       345 LBC           0 :                 PRSSYNTAXERROR;
 5693 teodor                    346 ECB             :         }
 5693 teodor                    347 CBC        9969 :         else if (statecode == WAITPOSDELIM)
                                348                 :         {
                                349            9969 :             if (t_iseq(state->prsbuf, ','))
 5693 teodor                    350 GIC         303 :                 statecode = INPOSINFO;
 5693 teodor                    351 CBC        9666 :             else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
 5693 teodor                    352 ECB             :             {
 5693 teodor                    353 GIC         210 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
 5649 tgl                       354 LBC           0 :                     PRSSYNTAXERROR;
 5693 teodor                    355 CBC         210 :                 WEP_SETWEIGHT(pos[npos - 1], 3);
                                356                 :             }
                                357            9456 :             else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
 5693 teodor                    358 EUB             :             {
 5693 teodor                    359 GIC         108 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
 5649 tgl                       360 UIC           0 :                     PRSSYNTAXERROR;
 5693 teodor                    361 GIC         108 :                 WEP_SETWEIGHT(pos[npos - 1], 2);
 5693 teodor                    362 ECB             :             }
 5693 teodor                    363 CBC        9348 :             else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
                                364                 :             {
 5693 teodor                    365 GIC         138 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
 5649 tgl                       366 UBC           0 :                     PRSSYNTAXERROR;
 5693 teodor                    367 GIC         138 :                 WEP_SETWEIGHT(pos[npos - 1], 1);
 5693 teodor                    368 ECB             :             }
 5693 teodor                    369 GIC        9210 :             else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
 5693 teodor                    370 ECB             :             {
 5693 teodor                    371 CBC          66 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
 5649 tgl                       372 LBC           0 :                     PRSSYNTAXERROR;
 5693 teodor                    373 GIC          66 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
 5693 teodor                    374 ECB             :             }
 5693 teodor                    375 GBC        9144 :             else if (t_isspace(state->prsbuf) ||
 5693 teodor                    376 CBC        4401 :                      *(state->prsbuf) == '\0')
 5693 teodor                    377 GIC        4959 :                 RETURN_TOKEN;
 5693 teodor                    378 CBC        4185 :             else if (!t_isdigit(state->prsbuf))
 5649 tgl                       379 UIC           0 :                 PRSSYNTAXERROR;
 5693 teodor                    380 ECB             :         }
 2118 tgl                       381 EUB             :         else                    /* internal error */
 5611 tgl                       382 LBC           0 :             elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
                                383                 :                  statecode);
 5693 teodor                    384 ECB             : 
                                385                 :         /* get next char */
 5693 teodor                    386 CBC      296170 :         state->prsbuf += pg_mblen(state->prsbuf);
 5693 teodor                    387 EUB             :     }
 5693 teodor                    388 ECB             : }
        

Generated by: LCOV version v1.16-55-g56c0a2a