LCOV - differential code coverage report
Current view: top level - src/backend/utils/adt - tsvector_parser.c (source / functions) Coverage Total Hit UNC LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 87.6 % 145 127 2 7 7 2 7 63 4 53 9 65 2
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 5 5 4 1 5
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * tsvector_parser.c
       4                 :  *    Parser for tsvector
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  *
       8                 :  *
       9                 :  * IDENTIFICATION
      10                 :  *    src/backend/utils/adt/tsvector_parser.c
      11                 :  *
      12                 :  *-------------------------------------------------------------------------
      13                 :  */
      14                 : 
      15                 : #include "postgres.h"
      16                 : 
      17                 : #include "tsearch/ts_locale.h"
      18                 : #include "tsearch/ts_utils.h"
      19                 : 
      20                 : 
      21                 : /*
      22                 :  * Private state of tsvector parser.  Note that tsquery also uses this code to
      23                 :  * parse its input, hence the boolean flags.  The oprisdelim and is_tsquery
      24                 :  * flags are both true or both false in current usage, but we keep them
      25                 :  * separate for clarity.
      26                 :  *
      27                 :  * If oprisdelim is set, the following characters are treated as delimiters
      28                 :  * (in addition to whitespace): ! | & ( )
      29                 :  *
      30                 :  * is_tsquery affects *only* the content of error messages.
      31                 :  *
      32                 :  * is_web can be true to further modify tsquery parsing.
      33                 :  *
      34                 :  * If escontext is an ErrorSaveContext node, then soft errors can be
      35                 :  * captured there rather than being thrown.
      36                 :  */
      37                 : struct TSVectorParseStateData
      38                 : {
      39                 :     char       *prsbuf;         /* next input character */
      40                 :     char       *bufstart;       /* whole string (used only for errors) */
      41                 :     char       *word;           /* buffer to hold the current word */
      42                 :     int         len;            /* size in bytes allocated for 'word' */
      43                 :     int         eml;            /* max bytes per character */
      44                 :     bool        oprisdelim;     /* treat ! | & ( ) as delimiters? */
      45                 :     bool        is_tsquery;     /* say "tsquery" not "tsvector" in errors? */
      46                 :     bool        is_web;         /* we're in websearch_to_tsquery() */
      47                 :     Node       *escontext;      /* for soft error reporting */
      48                 : };
      49                 : 
      50                 : 
      51                 : /*
      52                 :  * Initializes a parser state object for the given input string.
      53                 :  * A bitmask of flags (see ts_utils.h) and an error context object
      54                 :  * can be provided as well.
      55                 :  */
      56                 : TSVectorParseState
      57 GNC        3816 : init_tsvector_parser(char *input, int flags, Node *escontext)
      58                 : {
      59                 :     TSVectorParseState state;
      60                 : 
      61 GIC        3816 :     state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
      62            3816 :     state->prsbuf = input;
      63            3816 :     state->bufstart = input;
      64            3816 :     state->len = 32;
      65            3816 :     state->word = (char *) palloc(state->len);
      66            3816 :     state->eml = pg_database_encoding_max_length();
      67            3816 :     state->oprisdelim = (flags & P_TSV_OPR_IS_DELIM) != 0;
      68 CBC        3816 :     state->is_tsquery = (flags & P_TSV_IS_TSQUERY) != 0;
      69 GIC        3816 :     state->is_web = (flags & P_TSV_IS_WEB) != 0;
      70 GNC        3816 :     state->escontext = escontext;
      71                 : 
      72 GIC        3816 :     return state;
      73 ECB             : }
      74                 : 
      75                 : /*
      76                 :  * Reinitializes parser to parse 'input', instead of previous input.
      77                 :  *
      78                 :  * Note that bufstart (the string reported in errors) is not changed.
      79                 :  */
      80                 : void
      81 CBC        4065 : reset_tsvector_parser(TSVectorParseState state, char *input)
      82 ECB             : {
      83 CBC        4065 :     state->prsbuf = input;
      84            4065 : }
      85                 : 
      86 ECB             : /*
      87                 :  * Shuts down a tsvector parser.
      88                 :  */
      89                 : void
      90 GIC        3813 : close_tsvector_parser(TSVectorParseState state)
      91                 : {
      92            3813 :     pfree(state->word);
      93            3813 :     pfree(state);
      94            3813 : }
      95 ECB             : 
      96                 : /* increase the size of 'word' if needed to hold one more character */
      97                 : #define RESIZEPRSBUF \
      98                 : do { \
      99                 :     int clen = curpos - state->word; \
     100                 :     if ( clen + state->eml >= state->len ) \
     101                 :     { \
     102                 :         state->len *= 2; \
     103                 :         state->word = (char *) repalloc(state->word, state->len); \
     104                 :         curpos = state->word + clen; \
     105                 :     } \
     106                 : } while (0)
     107                 : 
     108                 : /* Fills gettoken_tsvector's output parameters, and returns true */
     109                 : #define RETURN_TOKEN \
     110                 : do { \
     111                 :     if (pos_ptr != NULL) \
     112                 :     { \
     113                 :         *pos_ptr = pos; \
     114                 :         *poslen = npos; \
     115                 :     } \
     116                 :     else if (pos != NULL) \
     117                 :         pfree(pos); \
     118                 :     \
     119                 :     if (strval != NULL) \
     120                 :         *strval = state->word; \
     121                 :     if (lenval != NULL) \
     122                 :         *lenval = curpos - state->word; \
     123                 :     if (endptr != NULL) \
     124                 :         *endptr = state->prsbuf; \
     125                 :     return true; \
     126                 : } while(0)
     127                 : 
     128                 : 
     129                 : /* State codes used in gettoken_tsvector */
     130                 : #define WAITWORD        1
     131                 : #define WAITENDWORD     2
     132                 : #define WAITNEXTCHAR    3
     133                 : #define WAITENDCMPLX    4
     134                 : #define WAITPOSINFO     5
     135                 : #define INPOSINFO       6
     136                 : #define WAITPOSDELIM    7
     137                 : #define WAITCHARCMPLX   8
     138                 : 
     139                 : #define PRSSYNTAXERROR return prssyntaxerror(state)
     140                 : 
     141                 : static bool
     142 GIC           9 : prssyntaxerror(TSVectorParseState state)
     143                 : {
     144 GNC           9 :     errsave(state->escontext,
     145                 :             (errcode(ERRCODE_SYNTAX_ERROR),
     146                 :              state->is_tsquery ?
     147                 :              errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
     148                 :              errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
     149                 :     /* In soft error situation, return false as convenience for caller */
     150               6 :     return false;
     151                 : }
     152                 : 
     153                 : 
     154                 : /*
     155                 :  * Get next token from string being parsed. Returns true if successful,
     156                 :  * false if the end of the input string is reached or on a soft error.
     157                 :  *
     158                 :  * On success, these output parameters are filled in:
     159 ECB             :  *
     160                 :  * *strval      pointer to token
     161                 :  * *lenval      length of *strval
     162                 :  * *pos_ptr     pointer to a palloc'd array of positions and weights
     163                 :  *              associated with the token. If the caller is not interested
     164                 :  *              in the information, NULL can be supplied. Otherwise
     165                 :  *              the caller is responsible for pfreeing the array.
     166                 :  * *poslen      number of elements in *pos_ptr
     167                 :  * *endptr      scan resumption point
     168                 :  *
     169                 :  * Pass NULL for any unwanted output parameters.
     170                 :  *
     171                 :  * If state->escontext is an ErrorSaveContext, then caller must check
     172                 :  * SOFT_ERROR_OCCURRED() to determine whether a "false" result means
     173                 :  * error or normal end-of-string.
     174                 :  */
     175                 : bool
     176 GIC       96365 : gettoken_tsvector(TSVectorParseState state,
     177                 :                   char **strval, int *lenval,
     178                 :                   WordEntryPos **pos_ptr, int *poslen,
     179                 :                   char **endptr)
     180                 : {
     181           96365 :     int         oldstate = 0;
     182           96365 :     char       *curpos = state->word;
     183           96365 :     int         statecode = WAITWORD;
     184                 : 
     185                 :     /*
     186                 :      * pos collects the comma-delimited list of positions that follows
     187                 :      * the actual token.
     188                 :      */
     189           96365 :     WordEntryPos *pos = NULL;
     190           96365 :     int         npos = 0;       /* elements of pos used */
     191           96365 :     int         posalen = 0;    /* allocated size of pos */
     192                 : 
     193                 :     while (1)
     194                 :     {
     195          392574 :         if (statecode == WAITWORD)
     196                 :         {
     197 CBC      185055 :             if (*(state->prsbuf) == '\0')
     198 GIC        1898 :                 return false;
     199          183157 :             else if (!state->is_web && t_iseq(state->prsbuf, '\''))
     200              81 :                 statecode = WAITENDCMPLX;
     201          183076 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     202 ECB             :             {
     203 CBC           3 :                 statecode = WAITNEXTCHAR;
     204               3 :                 oldstate = WAITENDWORD;
     205                 :             }
     206 GIC      183073 :             else if ((state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     207          183073 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     208 UIC           0 :                 PRSSYNTAXERROR;
     209 GIC      183073 :             else if (!t_isspace(state->prsbuf))
     210 ECB             :             {
     211 CBC       94383 :                 COPYCHAR(curpos, state->prsbuf);
     212           94383 :                 curpos += pg_mblen(state->prsbuf);
     213 GIC       94383 :                 statecode = WAITENDWORD;
     214                 :             }
     215                 :         }
     216 CBC      207519 :         else if (statecode == WAITNEXTCHAR)
     217                 :         {
     218              81 :             if (*(state->prsbuf) == '\0')
     219 UNC           0 :                 ereturn(state->escontext, false,
     220 ECB             :                         (errcode(ERRCODE_SYNTAX_ERROR),
     221                 :                          errmsg("there is no escaped character: \"%s\"",
     222                 :                                 state->bufstart)));
     223                 :             else
     224                 :             {
     225 CBC          81 :                 RESIZEPRSBUF;
     226 GIC          81 :                 COPYCHAR(curpos, state->prsbuf);
     227 CBC          81 :                 curpos += pg_mblen(state->prsbuf);
     228              81 :                 Assert(oldstate != 0);
     229 GBC          81 :                 statecode = oldstate;
     230 ECB             :             }
     231                 :         }
     232 CBC      207438 :         else if (statecode == WAITENDWORD)
     233 ECB             :         {
     234 CBC      191625 :             if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     235                 :             {
     236 GIC          36 :                 statecode = WAITNEXTCHAR;
     237 CBC          36 :                 oldstate = WAITENDWORD;
     238                 :             }
     239          191589 :             else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
     240 GBC      103404 :                      (state->oprisdelim && ISOPERATOR(state->prsbuf)) ||
     241 GIC      102516 :                      (state->is_web && t_iseq(state->prsbuf, '"')))
     242                 :             {
     243           89079 :                 RESIZEPRSBUF;
     244           89079 :                 if (curpos == state->word)
     245 UIC           0 :                     PRSSYNTAXERROR;
     246 CBC       89079 :                 *(curpos) = '\0';
     247           89079 :                 RETURN_TOKEN;
     248 ECB             :             }
     249 CBC      102510 :             else if (t_iseq(state->prsbuf, ':'))
     250 ECB             :             {
     251 GIC        5307 :                 if (curpos == state->word)
     252 UIC           0 :                     PRSSYNTAXERROR;
     253 CBC        5307 :                 *(curpos) = '\0';
     254 GIC        5307 :                 if (state->oprisdelim)
     255 CBC         348 :                     RETURN_TOKEN;
     256                 :                 else
     257            4959 :                     statecode = INPOSINFO;
     258 ECB             :             }
     259                 :             else
     260                 :             {
     261 CBC       97203 :                 RESIZEPRSBUF;
     262           97203 :                 COPYCHAR(curpos, state->prsbuf);
     263 GIC       97203 :                 curpos += pg_mblen(state->prsbuf);
     264 ECB             :             }
     265                 :         }
     266 GBC       15813 :         else if (statecode == WAITENDCMPLX)
     267 ECB             :         {
     268 CBC         462 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     269                 :             {
     270              81 :                 statecode = WAITCHARCMPLX;
     271                 :             }
     272             381 :             else if (!state->is_web && t_iseq(state->prsbuf, '\\'))
     273 EUB             :             {
     274 CBC          42 :                 statecode = WAITNEXTCHAR;
     275              42 :                 oldstate = WAITENDCMPLX;
     276 ECB             :             }
     277 GIC         339 :             else if (*(state->prsbuf) == '\0')
     278 LBC           0 :                 PRSSYNTAXERROR;
     279                 :             else
     280                 :             {
     281 GIC         339 :                 RESIZEPRSBUF;
     282 CBC         339 :                 COPYCHAR(curpos, state->prsbuf);
     283             339 :                 curpos += pg_mblen(state->prsbuf);
     284 ECB             :             }
     285                 :         }
     286 GIC       15351 :         else if (statecode == WAITCHARCMPLX)
     287 ECB             :         {
     288 GIC          81 :             if (!state->is_web && t_iseq(state->prsbuf, '\''))
     289 ECB             :             {
     290 UIC           0 :                 RESIZEPRSBUF;
     291 LBC           0 :                 COPYCHAR(curpos, state->prsbuf);
     292 UIC           0 :                 curpos += pg_mblen(state->prsbuf);
     293 LBC           0 :                 statecode = WAITENDCMPLX;
     294                 :             }
     295 ECB             :             else
     296                 :             {
     297 GIC          81 :                 RESIZEPRSBUF;
     298 CBC          81 :                 *(curpos) = '\0';
     299 GBC          81 :                 if (curpos == state->word)
     300 GIC           9 :                     PRSSYNTAXERROR;
     301              72 :                 if (state->oprisdelim)
     302 ECB             :                 {
     303                 :                     /* state->prsbuf+=pg_mblen(state->prsbuf); */
     304 CBC          33 :                     RETURN_TOKEN;
     305                 :                 }
     306                 :                 else
     307              39 :                     statecode = WAITPOSINFO;
     308 GIC          39 :                 continue;       /* recheck current character */
     309 ECB             :             }
     310                 :         }
     311 GBC       15270 :         else if (statecode == WAITPOSINFO)
     312 EUB             :         {
     313 GBC          39 :             if (t_iseq(state->prsbuf, ':'))
     314 UBC           0 :                 statecode = INPOSINFO;
     315                 :             else
     316 GIC          39 :                 RETURN_TOKEN;
     317                 :         }
     318 CBC       15231 :         else if (statecode == INPOSINFO)
     319 ECB             :         {
     320 CBC        5262 :             if (t_isdigit(state->prsbuf))
     321 ECB             :             {
     322 CBC        5262 :                 if (posalen == 0)
     323                 :                 {
     324 GIC        4959 :                     posalen = 4;
     325 CBC        4959 :                     pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
     326 GIC        4959 :                     npos = 0;
     327                 :                 }
     328 CBC         303 :                 else if (npos + 1 >= posalen)
     329 ECB             :                 {
     330 GIC          57 :                     posalen *= 2;
     331              57 :                     pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
     332 ECB             :                 }
     333 GIC        5262 :                 npos++;
     334 CBC        5262 :                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
     335 EUB             :                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
     336 GIC        5262 :                 if (WEP_GETPOS(pos[npos - 1]) == 0)
     337 UNC           0 :                     ereturn(state->escontext, false,
     338                 :                             (errcode(ERRCODE_SYNTAX_ERROR),
     339 ECB             :                              errmsg("wrong position info in tsvector: \"%s\"",
     340                 :                                     state->bufstart)));
     341 CBC        5262 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     342 GIC        5262 :                 statecode = WAITPOSDELIM;
     343 ECB             :             }
     344                 :             else
     345 LBC           0 :                 PRSSYNTAXERROR;
     346 ECB             :         }
     347 CBC        9969 :         else if (statecode == WAITPOSDELIM)
     348                 :         {
     349            9969 :             if (t_iseq(state->prsbuf, ','))
     350 GIC         303 :                 statecode = INPOSINFO;
     351 CBC        9666 :             else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
     352 ECB             :             {
     353 GIC         210 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     354 LBC           0 :                     PRSSYNTAXERROR;
     355 CBC         210 :                 WEP_SETWEIGHT(pos[npos - 1], 3);
     356                 :             }
     357            9456 :             else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
     358 EUB             :             {
     359 GIC         108 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     360 UIC           0 :                     PRSSYNTAXERROR;
     361 GIC         108 :                 WEP_SETWEIGHT(pos[npos - 1], 2);
     362 ECB             :             }
     363 CBC        9348 :             else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
     364                 :             {
     365 GIC         138 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     366 UBC           0 :                     PRSSYNTAXERROR;
     367 GIC         138 :                 WEP_SETWEIGHT(pos[npos - 1], 1);
     368 ECB             :             }
     369 GIC        9210 :             else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
     370 ECB             :             {
     371 CBC          66 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     372 LBC           0 :                     PRSSYNTAXERROR;
     373 GIC          66 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     374 ECB             :             }
     375 GBC        9144 :             else if (t_isspace(state->prsbuf) ||
     376 CBC        4401 :                      *(state->prsbuf) == '\0')
     377 GIC        4959 :                 RETURN_TOKEN;
     378 CBC        4185 :             else if (!t_isdigit(state->prsbuf))
     379 UIC           0 :                 PRSSYNTAXERROR;
     380 ECB             :         }
     381 EUB             :         else                    /* internal error */
     382 LBC           0 :             elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
     383                 :                  statecode);
     384 ECB             : 
     385                 :         /* get next char */
     386 CBC      296170 :         state->prsbuf += pg_mblen(state->prsbuf);
     387 EUB             :     }
     388 ECB             : }
        

Generated by: LCOV version v1.16-55-g56c0a2a