LCOV - differential code coverage report
Current view: top level - src/backend/utils/adt - like_match.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 87.2 % 86 75 11 75
Current Date: 2023-04-08 15:15:32 Functions: 66.7 % 6 4 2 4
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * like_match.c
       4                 :  *    LIKE pattern matching internal code.
       5                 :  *
       6                 :  * This file is included by like.c four times, to provide matching code for
       7                 :  * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
       8                 :  * and (4) case-insensitive matches in single-byte encodings.
       9                 :  * (UTF8 is a special case because we can use a much more efficient version
      10                 :  * of NextChar than can be used for general multi-byte encodings.)
      11                 :  *
      12                 :  * Before the inclusion, we need to define the following macros:
      13                 :  *
      14                 :  * NextChar
      15                 :  * MatchText - name of function wanted
      16                 :  * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
      17                 :  * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars
      18                 :  *
      19                 :  * Copyright (c) 1996-2023, PostgreSQL Global Development Group
      20                 :  *
      21                 :  * IDENTIFICATION
      22                 :  *  src/backend/utils/adt/like_match.c
      23                 :  *
      24                 :  *-------------------------------------------------------------------------
      25                 :  */
      26                 : 
      27                 : /*
      28                 :  *  Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
      29                 :  *  Rich $alz is now <rsalz@bbn.com>.
      30                 :  *  Special thanks to Lars Mathiesen <thorinn@diku.dk> for the
      31                 :  *  LIKE_ABORT code.
      32                 :  *
      33                 :  *  This code was shamelessly stolen from the "pql" code by myself and
      34                 :  *  slightly modified :)
      35                 :  *
      36                 :  *  All references to the word "star" were replaced by "percent"
      37                 :  *  All references to the word "wild" were replaced by "like"
      38                 :  *
      39                 :  *  All the nice shell RE matching stuff was replaced by just "_" and "%"
      40                 :  *
      41                 :  *  As I don't have a copy of the SQL standard handy I wasn't sure whether
      42                 :  *  to leave in the '\' escape character handling.
      43                 :  *
      44                 :  *  Keith Parks. <keith@mtcc.demon.co.uk>
      45                 :  *
      46                 :  *  SQL lets you specify the escape character by saying
      47                 :  *  LIKE <pattern> ESCAPE <escape character>. We are a small operation
      48                 :  *  so we force you to use '\'. - ay 7/95
      49                 :  *
      50                 :  *  Now we have the like_escape() function that converts patterns with
      51                 :  *  any specified escape character (or none at all) to the internal
      52                 :  *  default escape character, which is still '\'. - tgl 9/2000
      53                 :  *
      54                 :  * The code is rewritten to avoid requiring null-terminated strings,
      55                 :  * which in turn allows us to leave out some memcpy() operations.
      56                 :  * This code should be faster and take less memory, but no promises...
      57                 :  * - thomas 2000-08-06
      58                 :  */
      59                 : 
      60                 : 
      61                 : /*--------------------
      62                 :  *  Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
      63                 :  *
      64                 :  *  LIKE_TRUE: they match
      65                 :  *  LIKE_FALSE: they don't match
      66                 :  *  LIKE_ABORT: not only don't they match, but the text is too short.
      67                 :  *
      68                 :  * If LIKE_ABORT is returned, then no suffix of the text can match the
      69                 :  * pattern either, so an upper-level % scan can stop scanning now.
      70                 :  *--------------------
      71                 :  */
      72                 : 
      73                 : #ifdef MATCH_LOWER
      74                 : #define GETCHAR(t) MATCH_LOWER(t)
      75                 : #else
      76                 : #define GETCHAR(t) (t)
      77                 : #endif
      78                 : 
      79                 : static int
      80 CBC     1510654 : MatchText(const char *t, int tlen, const char *p, int plen,
      81                 :           pg_locale_t locale, bool locale_is_c)
      82                 : {
      83                 :     /* Fast path for match-everything pattern */
      84         1510654 :     if (plen == 1 && *p == '%')
      85             401 :         return LIKE_TRUE;
      86                 : 
      87                 :     /* Since this function recurses, it could be driven to stack overflow */
      88         1510253 :     check_stack_depth();
      89                 : 
      90                 :     /*
      91                 :      * In this loop, we advance by char when matching wildcards (and thus on
      92                 :      * recursive entry to this function we are properly char-synced). On other
      93                 :      * occasions it is safe to advance by byte, as the text and pattern will
      94                 :      * be in lockstep. This allows us to perform all comparisons between the
      95                 :      * text and pattern on a byte by byte basis, even for multi-byte
      96                 :      * encodings.
      97                 :      */
      98         1854987 :     while (tlen > 0 && plen > 0)
      99                 :     {
     100         1849751 :         if (*p == '\\')
     101                 :         {
     102                 :             /* Next pattern byte must match literally, whatever it is */
     103            6100 :             NextByte(p, plen);
     104                 :             /* ... and there had better be one, per SQL standard */
     105            6100 :             if (plen <= 0)
     106 UBC           0 :                 ereport(ERROR,
     107                 :                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     108                 :                          errmsg("LIKE pattern must not end with escape character")));
     109 CBC        6100 :             if (GETCHAR(*p) != GETCHAR(*t))
     110            1654 :                 return LIKE_FALSE;
     111                 :         }
     112         1843651 :         else if (*p == '%')
     113                 :         {
     114                 :             char        firstpat;
     115                 : 
     116                 :             /*
     117                 :              * % processing is essentially a search for a text position at
     118                 :              * which the remainder of the text matches the remainder of the
     119                 :              * pattern, using a recursive call to check each potential match.
     120                 :              *
     121                 :              * If there are wildcards immediately following the %, we can skip
     122                 :              * over them first, using the idea that any sequence of N _'s and
     123                 :              * one or more %'s is equivalent to N _'s and one % (ie, it will
     124                 :              * match any sequence of at least N text characters).  In this way
     125                 :              * we will always run the recursive search loop using a pattern
     126                 :              * fragment that begins with a literal character-to-match, thereby
     127                 :              * not recursing more than we have to.
     128                 :              */
     129           84381 :             NextByte(p, plen);
     130                 : 
     131           84663 :             while (plen > 0)
     132                 :             {
     133           65911 :                 if (*p == '%')
     134               3 :                     NextByte(p, plen);
     135           65908 :                 else if (*p == '_')
     136                 :                 {
     137                 :                     /* If not enough text left to match the pattern, ABORT */
     138             282 :                     if (tlen <= 0)
     139               3 :                         return LIKE_ABORT;
     140             279 :                     NextChar(t, tlen);
     141             279 :                     NextByte(p, plen);
     142                 :                 }
     143                 :                 else
     144           65626 :                     break;      /* Reached a non-wildcard pattern char */
     145                 :             }
     146                 : 
     147                 :             /*
     148                 :              * If we're at end of pattern, match: we have a trailing % which
     149                 :              * matches any remaining text string.
     150                 :              */
     151           84378 :             if (plen <= 0)
     152           18752 :                 return LIKE_TRUE;
     153                 : 
     154                 :             /*
     155                 :              * Otherwise, scan for a text position at which we can match the
     156                 :              * rest of the pattern.  The first remaining pattern char is known
     157                 :              * to be a regular or escaped literal character, so we can compare
     158                 :              * the first pattern byte to each text byte to avoid recursing
     159                 :              * more than we have to.  This fact also guarantees that we don't
     160                 :              * have to consider a match to the zero-length substring at the
     161                 :              * end of the text.
     162                 :              */
     163           65626 :             if (*p == '\\')
     164                 :             {
     165               2 :                 if (plen < 2)
     166 UBC           0 :                     ereport(ERROR,
     167                 :                             (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     168                 :                              errmsg("LIKE pattern must not end with escape character")));
     169 CBC           2 :                 firstpat = GETCHAR(p[1]);
     170                 :             }
     171                 :             else
     172           65624 :                 firstpat = GETCHAR(*p);
     173                 : 
     174         1911480 :             while (tlen > 0)
     175                 :             {
     176         1860312 :                 if (GETCHAR(*t) == firstpat)
     177                 :                 {
     178           53540 :                     int         matched = MatchText(t, tlen, p, plen,
     179                 :                                                     locale, locale_is_c);
     180                 : 
     181           53540 :                     if (matched != LIKE_FALSE)
     182           14458 :                         return matched; /* TRUE or ABORT */
     183                 :                 }
     184                 : 
     185         1845878 :                 NextChar(t, tlen);
     186                 :             }
     187                 : 
     188                 :             /*
     189                 :              * End of text with no match, so no point in trying later places
     190                 :              * to start matching this pattern.
     191                 :              */
     192           51168 :             return LIKE_ABORT;
     193                 :         }
     194         1759270 :         else if (*p == '_')
     195                 :         {
     196                 :             /* _ matches any single character, and we know there is one */
     197            4931 :             NextChar(t, tlen);
     198            4931 :             NextByte(p, plen);
     199            4931 :             continue;
     200                 :         }
     201         1754339 :         else if (GETCHAR(*p) != GETCHAR(*t))
     202                 :         {
     203                 :             /* non-wildcard pattern char fails to match text char */
     204         1418982 :             return LIKE_FALSE;
     205                 :         }
     206                 : 
     207                 :         /*
     208                 :          * Pattern and text match, so advance.
     209                 :          *
     210                 :          * It is safe to use NextByte instead of NextChar here, even for
     211                 :          * multi-byte character sets, because we are not following immediately
     212                 :          * after a wildcard character. If we are in the middle of a multibyte
     213                 :          * character, we must already have matched at least one byte of the
     214                 :          * character from both text and pattern; so we cannot get out-of-sync
     215                 :          * on character boundaries.  And we know that no backend-legal
     216                 :          * encoding allows ASCII characters such as '%' to appear as non-first
     217                 :          * bytes of characters, so we won't mistakenly detect a new wildcard.
     218                 :          */
     219          339803 :         NextByte(t, tlen);
     220          339803 :         NextByte(p, plen);
     221                 :     }
     222                 : 
     223            5236 :     if (tlen > 0)
     224             152 :         return LIKE_FALSE;      /* end of pattern, but not of text */
     225                 : 
     226                 :     /*
     227                 :      * End of text, but perhaps not of pattern.  Match iff the remaining
     228                 :      * pattern can match a zero-length string, ie, it's zero or more %'s.
     229                 :      */
     230            5381 :     while (plen > 0 && *p == '%')
     231             297 :         NextByte(p, plen);
     232            5084 :     if (plen <= 0)
     233            2342 :         return LIKE_TRUE;
     234                 : 
     235                 :     /*
     236                 :      * End of text with no match, so no point in trying later places to start
     237                 :      * matching this pattern.
     238                 :      */
     239            2742 :     return LIKE_ABORT;
     240                 : }                               /* MatchText() */
     241                 : 
     242                 : /*
     243                 :  * like_escape() --- given a pattern and an ESCAPE string,
     244                 :  * convert the pattern to use Postgres' standard backslash escape convention.
     245                 :  */
     246                 : #ifdef do_like_escape
     247                 : 
     248                 : static text *
     249             112 : do_like_escape(text *pat, text *esc)
     250                 : {
     251                 :     text       *result;
     252                 :     char       *p,
     253                 :                *e,
     254                 :                *r;
     255                 :     int         plen,
     256                 :                 elen;
     257                 :     bool        afterescape;
     258                 : 
     259             112 :     p = VARDATA_ANY(pat);
     260             112 :     plen = VARSIZE_ANY_EXHDR(pat);
     261             112 :     e = VARDATA_ANY(esc);
     262             112 :     elen = VARSIZE_ANY_EXHDR(esc);
     263                 : 
     264                 :     /*
     265                 :      * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
     266                 :      * trying to calculate the size more accurately than that.
     267                 :      */
     268             112 :     result = (text *) palloc(plen * 2 + VARHDRSZ);
     269             112 :     r = VARDATA(result);
     270                 : 
     271             112 :     if (elen == 0)
     272                 :     {
     273                 :         /*
     274                 :          * No escape character is wanted.  Double any backslashes in the
     275                 :          * pattern to make them act like ordinary characters.
     276                 :          */
     277              64 :         while (plen > 0)
     278                 :         {
     279              48 :             if (*p == '\\')
     280 UBC           0 :                 *r++ = '\\';
     281 CBC          96 :             CopyAdvChar(r, p, plen);
     282                 :         }
     283                 :     }
     284                 :     else
     285                 :     {
     286                 :         /*
     287                 :          * The specified escape must be only a single character.
     288                 :          */
     289              96 :         NextChar(e, elen);
     290              96 :         if (elen != 0)
     291 UBC           0 :             ereport(ERROR,
     292                 :                     (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     293                 :                      errmsg("invalid escape string"),
     294                 :                      errhint("Escape string must be empty or one character.")));
     295                 : 
     296 CBC          96 :         e = VARDATA_ANY(esc);
     297                 : 
     298                 :         /*
     299                 :          * If specified escape is '\', just copy the pattern as-is.
     300                 :          */
     301              96 :         if (*e == '\\')
     302                 :         {
     303 UBC           0 :             memcpy(result, pat, VARSIZE_ANY(pat));
     304               0 :             return result;
     305                 :         }
     306                 : 
     307                 :         /*
     308                 :          * Otherwise, convert occurrences of the specified escape character to
     309                 :          * '\', and double occurrences of '\' --- unless they immediately
     310                 :          * follow an escape character!
     311                 :          */
     312 CBC          96 :         afterescape = false;
     313             582 :         while (plen > 0)
     314                 :         {
     315             486 :             if (CHAREQ(p, e) && !afterescape)
     316                 :             {
     317              96 :                 *r++ = '\\';
     318              96 :                 NextChar(p, plen);
     319              96 :                 afterescape = true;
     320                 :             }
     321             390 :             else if (*p == '\\')
     322                 :             {
     323 UBC           0 :                 *r++ = '\\';
     324               0 :                 if (!afterescape)
     325               0 :                     *r++ = '\\';
     326               0 :                 NextChar(p, plen);
     327               0 :                 afterescape = false;
     328                 :             }
     329                 :             else
     330                 :             {
     331 CBC         762 :                 CopyAdvChar(r, p, plen);
     332             390 :                 afterescape = false;
     333                 :             }
     334                 :         }
     335                 :     }
     336                 : 
     337             112 :     SET_VARSIZE(result, r - ((char *) result));
     338                 : 
     339             112 :     return result;
     340                 : }
     341                 : #endif                          /* do_like_escape */
     342                 : 
     343                 : #ifdef CHAREQ
     344                 : #undef CHAREQ
     345                 : #endif
     346                 : 
     347                 : #undef NextChar
     348                 : #undef CopyAdvChar
     349                 : #undef MatchText
     350                 : 
     351                 : #ifdef do_like_escape
     352                 : #undef do_like_escape
     353                 : #endif
     354                 : 
     355                 : #undef GETCHAR
     356                 : 
     357                 : #ifdef MATCH_LOWER
     358                 : #undef MATCH_LOWER
     359                 : 
     360                 : #endif
        

Generated by: LCOV version v1.16-55-g56c0a2a