LCOV - differential code coverage report
Current view: top level - contrib/pg_trgm - trgm_op.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 91.5 % 493 451 4 4 34 5 118 5 323 3 120 4
Current Date: 2023-04-08 15:15:32 Functions: 96.2 % 52 50 2 7 1 42 7
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*
       2                 :  * contrib/pg_trgm/trgm_op.c
       3                 :  */
       4                 : #include "postgres.h"
       5                 : 
       6                 : #include <ctype.h>
       7                 : 
       8                 : #include "catalog/pg_type.h"
       9                 : #include "lib/qunique.h"
      10                 : #include "miscadmin.h"
      11                 : #include "trgm.h"
      12                 : #include "tsearch/ts_locale.h"
      13                 : #include "utils/guc.h"
      14                 : #include "utils/lsyscache.h"
      15                 : #include "utils/memutils.h"
      16                 : #include "utils/pg_crc.h"
      17                 : 
      18 GIC           3 : PG_MODULE_MAGIC;
      19                 : 
      20 ECB             : /* GUC variables */
      21                 : double      similarity_threshold = 0.3f;
      22                 : double      word_similarity_threshold = 0.6f;
      23                 : double      strict_word_similarity_threshold = 0.5f;
      24                 : 
      25 CBC           2 : PG_FUNCTION_INFO_V1(set_limit);
      26               2 : PG_FUNCTION_INFO_V1(show_limit);
      27               2 : PG_FUNCTION_INFO_V1(show_trgm);
      28               2 : PG_FUNCTION_INFO_V1(similarity);
      29               2 : PG_FUNCTION_INFO_V1(word_similarity);
      30               2 : PG_FUNCTION_INFO_V1(strict_word_similarity);
      31               2 : PG_FUNCTION_INFO_V1(similarity_dist);
      32               2 : PG_FUNCTION_INFO_V1(similarity_op);
      33               2 : PG_FUNCTION_INFO_V1(word_similarity_op);
      34               2 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
      35               1 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
      36               2 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
      37               2 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
      38               2 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
      39               1 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
      40               2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
      41                 : 
      42                 : /* Trigram with position */
      43                 : typedef struct
      44                 : {
      45                 :     trgm        trg;
      46                 :     int         index;
      47                 : } pos_trgm;
      48                 : 
      49                 : /* Trigram bound type */
      50                 : typedef uint8 TrgmBound;
      51                 : #define TRGM_BOUND_LEFT             0x01    /* trigram is left bound of word */
      52                 : #define TRGM_BOUND_RIGHT            0x02    /* trigram is right bound of word */
      53                 : 
      54                 : /* Word similarity flags */
      55                 : #define WORD_SIMILARITY_CHECK_ONLY  0x01    /* only check existence of similar
      56                 :                                              * search pattern in text */
      57                 : #define WORD_SIMILARITY_STRICT      0x02    /* force bounds of extent to match
      58                 :                                              * word bounds */
      59                 : 
      60                 : /*
      61                 :  * Module load callback
      62                 :  */
      63                 : void
      64               3 : _PG_init(void)
      65                 : {
      66                 :     /* Define custom GUC variables. */
      67               3 :     DefineCustomRealVariable("pg_trgm.similarity_threshold",
      68                 :                              "Sets the threshold used by the % operator.",
      69                 :                              "Valid range is 0.0 .. 1.0.",
      70                 :                              &similarity_threshold,
      71                 :                              0.3f,
      72                 :                              0.0,
      73                 :                              1.0,
      74                 :                              PGC_USERSET,
      75                 :                              0,
      76                 :                              NULL,
      77                 :                              NULL,
      78                 :                              NULL);
      79               3 :     DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
      80                 :                              "Sets the threshold used by the <% operator.",
      81                 :                              "Valid range is 0.0 .. 1.0.",
      82                 :                              &word_similarity_threshold,
      83                 :                              0.6f,
      84                 :                              0.0,
      85                 :                              1.0,
      86                 :                              PGC_USERSET,
      87                 :                              0,
      88                 :                              NULL,
      89                 :                              NULL,
      90                 :                              NULL);
      91               3 :     DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
      92                 :                              "Sets the threshold used by the <<% operator.",
      93                 :                              "Valid range is 0.0 .. 1.0.",
      94                 :                              &strict_word_similarity_threshold,
      95                 :                              0.5f,
      96                 :                              0.0,
      97                 :                              1.0,
      98                 :                              PGC_USERSET,
      99                 :                              0,
     100                 :                              NULL,
     101                 :                              NULL,
     102                 :                              NULL);
     103                 : 
     104               3 :     MarkGUCPrefixReserved("pg_trgm");
     105               3 : }
     106                 : 
     107                 : /*
     108                 :  * Deprecated function.
     109                 :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
     110                 :  */
     111                 : Datum
     112               2 : set_limit(PG_FUNCTION_ARGS)
     113                 : {
     114               2 :     float4      nlimit = PG_GETARG_FLOAT4(0);
     115                 :     char       *nlimit_str;
     116                 :     Oid         func_out_oid;
     117                 :     bool        is_varlena;
     118                 : 
     119               2 :     getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
     120                 : 
     121               2 :     nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
     122                 : 
     123               2 :     SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
     124                 :                     PGC_USERSET, PGC_S_SESSION);
     125                 : 
     126               2 :     PG_RETURN_FLOAT4(similarity_threshold);
     127                 : }
     128                 : 
     129                 : 
     130                 : /*
     131                 :  * Get similarity threshold for given index scan strategy number.
     132                 :  */
     133                 : double
     134           44078 : index_strategy_get_limit(StrategyNumber strategy)
     135                 : {
     136           44078 :     switch (strategy)
     137                 :     {
     138           33278 :         case SimilarityStrategyNumber:
     139           33278 :             return similarity_threshold;
     140            4822 :         case WordSimilarityStrategyNumber:
     141            4822 :             return word_similarity_threshold;
     142            5978 :         case StrictWordSimilarityStrategyNumber:
     143            5978 :             return strict_word_similarity_threshold;
     144 UBC           0 :         default:
     145               0 :             elog(ERROR, "unrecognized strategy number: %d", strategy);
     146                 :             break;
     147                 :     }
     148                 : 
     149                 :     return 0.0;                 /* keep compiler quiet */
     150                 : }
     151                 : 
     152                 : /*
     153                 :  * Deprecated function.
     154                 :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
     155                 :  */
     156                 : Datum
     157 CBC       20000 : show_limit(PG_FUNCTION_ARGS)
     158                 : {
     159           20000 :     PG_RETURN_FLOAT4(similarity_threshold);
     160                 : }
     161                 : 
     162                 : static int
     163         3132620 : comp_trgm(const void *a, const void *b)
     164                 : {
     165         3132620 :     return CMPTRGM(a, b);
     166                 : }
     167                 : 
     168                 : /*
     169                 :  * Finds first word in string, returns pointer to the word,
     170                 :  * endword points to the character after word
     171                 :  */
     172                 : static char *
     173          239413 : find_word(char *str, int lenstr, char **endword, int *charlen)
     174                 : {
     175          239413 :     char       *beginword = str;
     176                 : 
     177          253079 :     while (beginword - str < lenstr && !ISWORDCHR(beginword))
     178           13666 :         beginword += pg_mblen(beginword);
     179                 : 
     180          239413 :     if (beginword - str >= lenstr)
     181          113080 :         return NULL;
     182                 : 
     183          126333 :     *endword = beginword;
     184          126333 :     *charlen = 0;
     185         1087687 :     while (*endword - str < lenstr && ISWORDCHR(*endword))
     186                 :     {
     187          961354 :         *endword += pg_mblen(*endword);
     188          961354 :         (*charlen)++;
     189                 :     }
     190                 : 
     191          126333 :     return beginword;
     192                 : }
     193                 : 
     194                 : /*
     195                 :  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
     196                 :  * which is always exactly three bytes.  If we have three single-byte
     197                 :  * characters, we just use them as-is; otherwise we form a hash value.
     198                 :  */
     199                 : void
     200            1459 : compact_trigram(trgm *tptr, char *str, int bytelen)
     201                 : {
     202            1459 :     if (bytelen == 3)
     203                 :     {
     204            1459 :         CPTRGM(tptr, str);
     205                 :     }
     206                 :     else
     207                 :     {
     208                 :         pg_crc32    crc;
     209                 : 
     210 UBC           0 :         INIT_LEGACY_CRC32(crc);
     211               0 :         COMP_LEGACY_CRC32(crc, str, bytelen);
     212               0 :         FIN_LEGACY_CRC32(crc);
     213                 : 
     214                 :         /*
     215                 :          * use only 3 upper bytes from crc, hope, it's good enough hashing
     216                 :          */
     217               0 :         CPTRGM(tptr, &crc);
     218                 :     }
     219 CBC        1459 : }
     220                 : 
     221                 : /*
     222                 :  * Adds trigrams from words (already padded).
     223                 :  */
     224                 : static trgm *
     225          126397 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
     226                 : {
     227          126397 :     char       *ptr = str;
     228                 : 
     229          126397 :     if (charlen < 3)
     230              27 :         return tptr;
     231                 : 
     232          126370 :     if (bytelen > charlen)
     233                 :     {
     234                 :         /* Find multibyte character boundaries and apply compact_trigram */
     235 UBC           0 :         int         lenfirst = pg_mblen(str),
     236               0 :                     lenmiddle = pg_mblen(str + lenfirst),
     237               0 :                     lenlast = pg_mblen(str + lenfirst + lenmiddle);
     238                 : 
     239               0 :         while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
     240                 :         {
     241               0 :             compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
     242                 : 
     243               0 :             ptr += lenfirst;
     244               0 :             tptr++;
     245                 : 
     246               0 :             lenfirst = lenmiddle;
     247               0 :             lenmiddle = lenlast;
     248               0 :             lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
     249                 :         }
     250                 :     }
     251                 :     else
     252                 :     {
     253                 :         /* Fast path when there are no multibyte characters */
     254 CBC      126370 :         Assert(bytelen == charlen);
     255                 : 
     256         1214148 :         while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
     257                 :         {
     258         1087778 :             CPTRGM(tptr, ptr);
     259         1087778 :             ptr++;
     260         1087778 :             tptr++;
     261                 :         }
     262                 :     }
     263                 : 
     264          126370 :     return tptr;
     265                 : }
     266                 : 
     267                 : /*
     268                 :  * Make array of trigrams without sorting and removing duplicate items.
     269                 :  *
     270                 :  * trg: where to return the array of trigrams.
     271                 :  * str: source string, of length slen bytes.
     272                 :  * bounds: where to return bounds of trigrams (if needed).
     273                 :  *
     274                 :  * Returns length of the generated array.
     275                 :  */
     276                 : static int
     277          113081 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
     278                 : {
     279                 :     trgm       *tptr;
     280                 :     char       *buf;
     281                 :     int         charlen,
     282                 :                 bytelen;
     283                 :     char       *bword,
     284                 :                *eword;
     285                 : 
     286          113081 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
     287               1 :         return 0;
     288                 : 
     289          113080 :     tptr = trg;
     290                 : 
     291                 :     /* Allocate a buffer for case-folded, blank-padded words */
     292          113080 :     buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
     293                 : 
     294                 :     if (LPADDING > 0)
     295                 :     {
     296          113080 :         *buf = ' ';
     297                 :         if (LPADDING > 1)
     298          113080 :             *(buf + 1) = ' ';
     299                 :     }
     300                 : 
     301          113080 :     eword = str;
     302          239413 :     while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
     303                 :     {
     304                 : #ifdef IGNORECASE
     305          126333 :         bword = lowerstr_with_len(bword, eword - bword);
     306          126333 :         bytelen = strlen(bword);
     307                 : #else
     308                 :         bytelen = eword - bword;
     309                 : #endif
     310                 : 
     311          126333 :         memcpy(buf + LPADDING, bword, bytelen);
     312                 : 
     313                 : #ifdef IGNORECASE
     314          126333 :         pfree(bword);
     315                 : #endif
     316                 : 
     317          126333 :         buf[LPADDING + bytelen] = ' ';
     318          126333 :         buf[LPADDING + bytelen + 1] = ' ';
     319                 : 
     320                 :         /* Calculate trigrams marking their bounds if needed */
     321          126333 :         if (bounds)
     322           12400 :             bounds[tptr - trg] |= TRGM_BOUND_LEFT;
     323          126333 :         tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
     324                 :                              charlen + LPADDING + RPADDING);
     325          126333 :         if (bounds)
     326           12400 :             bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
     327                 :     }
     328                 : 
     329          113080 :     pfree(buf);
     330                 : 
     331          113080 :     return tptr - trg;
     332                 : }
     333                 : 
     334                 : /*
     335                 :  * Guard against possible overflow in the palloc requests below.  (We
     336                 :  * don't worry about the additive constants, since palloc can detect
     337                 :  * requests that are a little above MaxAllocSize --- we just need to
     338                 :  * prevent integer overflow in the multiplications.)
     339                 :  */
     340                 : static void
     341          101010 : protect_out_of_mem(int slen)
     342                 : {
     343          101010 :     if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
     344          101010 :         (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
     345 UBC           0 :         ereport(ERROR,
     346                 :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     347                 :                  errmsg("out of memory")));
     348 CBC      101010 : }
     349                 : 
     350                 : /*
     351                 :  * Make array of trigrams with sorting and removing duplicate items.
     352                 :  *
     353                 :  * str: source string, of length slen bytes.
     354                 :  *
     355                 :  * Returns the sorted array of unique trigrams.
     356                 :  */
     357                 : TRGM *
     358           88829 : generate_trgm(char *str, int slen)
     359                 : {
     360                 :     TRGM       *trg;
     361                 :     int         len;
     362                 : 
     363           88829 :     protect_out_of_mem(slen);
     364                 : 
     365           88829 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
     366           88829 :     trg->flag = ARRKEY;
     367                 : 
     368           88829 :     len = generate_trgm_only(GETARR(trg), str, slen, NULL);
     369           88829 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     370                 : 
     371           88829 :     if (len == 0)
     372               4 :         return trg;
     373                 : 
     374                 :     /*
     375                 :      * Make trigrams unique.
     376                 :      */
     377           88825 :     if (len > 1)
     378                 :     {
     379 GNC       88825 :         qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
     380 CBC       88825 :         len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
     381                 :     }
     382                 : 
     383           88825 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     384                 : 
     385           88825 :     return trg;
     386                 : }
     387                 : 
     388                 : /*
     389                 :  * Make array of positional trigrams from two trigram arrays trg1 and trg2.
     390                 :  *
     391                 :  * trg1: trigram array of search pattern, of length len1. trg1 is the
     392                 :  *       required word, whose positions don't matter and are replaced with -1.
     393                 :  * trg2: trigram array of text, of length len2. trg2 is the haystack in
     394                 :  *       which we search, so its positions have to be stored.
     395                 :  *
     396                 :  * Returns concatenated trigram array.
     397                 :  */
     398                 : static pos_trgm *
     399           12126 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
     400                 : {
     401                 :     pos_trgm   *result;
     402                 :     int         i,
     403           12126 :                 len = len1 + len2;
     404                 : 
     405           12126 :     result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
     406                 : 
     407          120864 :     for (i = 0; i < len1; i++)
     408                 :     {
     409          108738 :         memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
     410          108738 :         result[i].index = -1;
     411                 :     }
     412                 : 
     413          192225 :     for (i = 0; i < len2; i++)
     414                 :     {
     415          180099 :         memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
     416          180099 :         result[i + len1].index = i;
     417                 :     }
     418                 : 
     419           12126 :     return result;
     420                 : }
     421                 : 
     422                 : /*
     423                 :  * Compare position trigrams: compare trigrams first and position second.
     424                 :  */
     425                 : static int
     426         1307800 : comp_ptrgm(const void *v1, const void *v2)
     427                 : {
     428         1307800 :     const pos_trgm *p1 = (const pos_trgm *) v1;
     429         1307800 :     const pos_trgm *p2 = (const pos_trgm *) v2;
     430                 :     int         cmp;
     431                 : 
     432         1307800 :     cmp = CMPTRGM(p1->trg, p2->trg);
     433         1307800 :     if (cmp != 0)
     434         1268095 :         return cmp;
     435                 : 
     436           39705 :     if (p1->index < p2->index)
     437           21365 :         return -1;
     438           18340 :     else if (p1->index == p2->index)
     439 UBC           0 :         return 0;
     440                 :     else
     441 CBC       18340 :         return 1;
     442                 : }
     443                 : 
     444                 : /*
     445                 :  * Iterative search function which calculates maximum similarity with word in
     446                 :  * the string. Maximum similarity is calculated only if the flag
     447                 :  * WORD_SIMILARITY_CHECK_ONLY isn't set.
     448                 :  *
     449                 :  * trg2indexes: array which stores indexes of the array "found".
     450                 :  * found: array which stores true or false values.
     451                 :  * ulen1: count of unique trigrams of array "trg1".
     452                 :  * len2: length of array "trg2" and array "trg2indexes".
     453                 :  * len: length of the array "found".
     454                 :  * flags: set of boolean flags parameterizing similarity calculation.
     455                 :  * bounds: whether each trigram is left/right bound of word.
     456                 :  *
     457                 :  * Returns word similarity.
     458                 :  */
     459                 : static float4
     460 GIC       12126 : iterate_word_similarity(int *trg2indexes,
     461 ECB             :                         bool *found,
     462                 :                         int ulen1,
     463                 :                         int len2,
     464                 :                         int len,
     465                 :                         uint8 flags,
     466                 :                         TrgmBound *bounds)
     467                 : {
     468                 :     int        *lastpos,
     469                 :                 i,
     470 GIC       12126 :                 ulen2 = 0,
     471 CBC       12126 :                 count = 0,
     472           12126 :                 upper = -1,
     473 ECB             :                 lower;
     474                 :     float4      smlr_cur,
     475 GIC       12126 :                 smlr_max = 0.0f;
     476 ECB             :     double      threshold;
     477                 : 
     478 GIC       12126 :     Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
     479 ECB             : 
     480                 :     /* Select appropriate threshold */
     481 GIC       24252 :     threshold = (flags & WORD_SIMILARITY_STRICT) ?
     482 CBC       12126 :         strict_word_similarity_threshold :
     483 ECB             :         word_similarity_threshold;
     484                 : 
     485                 :     /*
     486                 :      * Consider first trigram as initial lower bound for strict word
     487                 :      * similarity, or initialize it later with first trigram present for plain
     488                 :      * word similarity.
     489                 :      */
     490 GIC       12126 :     lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
     491 ECB             : 
     492                 :     /* Memorise last position of each trigram */
     493 GIC       12126 :     lastpos = (int *) palloc(sizeof(int) * len);
     494 CBC       12126 :     memset(lastpos, -1, sizeof(int) * len);
     495 ECB             : 
     496 GIC      183655 :     for (i = 0; i < len2; i++)
     497 ECB             :     {
     498                 :         int         trgindex;
     499                 : 
     500 GNC      173313 :         CHECK_FOR_INTERRUPTS();
     501                 : 
     502                 :         /* Get index of next trigram */
     503          173313 :         trgindex = trg2indexes[i];
     504                 : 
     505 ECB             :         /* Update last position of this trigram */
     506 GIC      173313 :         if (lower >= 0 || found[trgindex])
     507                 :         {
     508 CBC      135805 :             if (lastpos[trgindex] < 0)
     509                 :             {
     510 GIC      133952 :                 ulen2++;
     511 CBC      133952 :                 if (found[trgindex])
     512 GIC       30756 :                     count++;
     513 ECB             :             }
     514 GIC      135805 :             lastpos[trgindex] = i;
     515 ECB             :         }
     516                 : 
     517                 :         /*
     518                 :          * Adjust upper bound if trigram is upper bound of word for strict
     519                 :          * word similarity, or if trigram is present in required substring for
     520                 :          * plain word similarity
     521                 :          */
     522 GIC      250355 :         if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
     523           77042 :             : found[trgindex])
     524                 :         {
     525                 :             int         prev_lower,
     526                 :                         tmp_ulen2,
     527 ECB             :                         tmp_lower,
     528                 :                         tmp_count;
     529                 : 
     530 GIC       25638 :             upper = i;
     531           25638 :             if (lower == -1)
     532                 :             {
     533            4695 :                 lower = i;
     534            4695 :                 ulen2 = 1;
     535 ECB             :             }
     536                 : 
     537 GIC       25638 :             smlr_cur = CALCSML(count, ulen1, ulen2);
     538 ECB             : 
     539                 :             /* Also try to adjust lower bound for greater similarity */
     540 GIC       25638 :             tmp_count = count;
     541           25638 :             tmp_ulen2 = ulen2;
     542 CBC       25638 :             prev_lower = lower;
     543 GIC      208652 :             for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
     544                 :             {
     545 ECB             :                 float       smlr_tmp;
     546                 :                 int         tmp_trgindex;
     547                 : 
     548                 :                 /*
     549                 :                  * Adjust lower bound only if trigram is lower bound of word
     550                 :                  * for strict word similarity, or consider every trigram as
     551                 :                  * lower bound for plain word similarity.
     552                 :                  */
     553 GIC      184798 :                 if (!(flags & WORD_SIMILARITY_STRICT)
     554          145233 :                     || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
     555                 :                 {
     556           59704 :                     smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
     557           59704 :                     if (smlr_tmp > smlr_cur)
     558 ECB             :                     {
     559 CBC        3511 :                         smlr_cur = smlr_tmp;
     560 GIC        3511 :                         ulen2 = tmp_ulen2;
     561 CBC        3511 :                         lower = tmp_lower;
     562            3511 :                         count = tmp_count;
     563                 :                     }
     564 ECB             : 
     565                 :                     /*
     566                 :                      * If we only check that word similarity is greater than
     567                 :                      * threshold we do not need to calculate a maximum
     568                 :                      * similarity.
     569                 :                      */
     570 GIC       59704 :                     if ((flags & WORD_SIMILARITY_CHECK_ONLY)
     571           37114 :                         && smlr_cur >= threshold)
     572            1784 :                         break;
     573                 :                 }
     574                 : 
     575 CBC      183014 :                 tmp_trgindex = trg2indexes[tmp_lower];
     576          183014 :                 if (lastpos[tmp_trgindex] == tmp_lower)
     577 ECB             :                 {
     578 GIC      180753 :                     tmp_ulen2--;
     579          180753 :                     if (found[tmp_trgindex])
     580 CBC       46591 :                         tmp_count--;
     581 ECB             :                 }
     582                 :             }
     583                 : 
     584 CBC       25638 :             smlr_max = Max(smlr_max, smlr_cur);
     585 ECB             : 
     586                 :             /*
     587                 :              * if we only check that word similarity is greater than threshold
     588                 :              * we do not need to calculate a maximum similarity.
     589                 :              */
     590 GIC       25638 :             if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
     591            1784 :                 break;
     592                 : 
     593           40602 :             for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
     594                 :             {
     595 ECB             :                 int         tmp_trgindex;
     596                 : 
     597 GIC       16748 :                 tmp_trgindex = trg2indexes[tmp_lower];
     598 CBC       16748 :                 if (lastpos[tmp_trgindex] == tmp_lower)
     599 GIC       16000 :                     lastpos[tmp_trgindex] = -1;
     600                 :             }
     601                 :         }
     602 ECB             :     }
     603                 : 
     604 CBC       12126 :     pfree(lastpos);
     605                 : 
     606 GIC       12126 :     return smlr_max;
     607                 : }
     608                 : 
     609 ECB             : /*
     610                 :  * Calculate word similarity.
     611                 :  * This function prepares two arrays: "trg2indexes" and "found". Then these arrays
     612                 :  * are used to calculate word similarity using iterate_word_similarity().
     613                 :  *
     614                 :  * "trg2indexes" is an array which stores indexes into the array "found".
     615                 :  * In other words:
     616                 :  * trg2indexes[j] = i;
     617                 :  * found[i] = true (or false);
     618                 :  * If found[i] == true then trigram trg2[j] is present in array "trg1".
     619                 :  * If found[i] == false then trigram trg2[j] is not present in array "trg1".
     620                 :  *
     621                 :  * str1: search pattern string, of length slen1 bytes.
     622                 :  * str2: text in which we are looking for a word, of length slen2 bytes.
     623                 :  * flags: set of boolean flags parameterizing similarity calculation.
     624                 :  *
     625                 :  * Returns word similarity.
                         :  *
                         :  * trg1, trg2, ptrg, trg2indexes and found are pfree'd before returning.
                         :  * NOTE(review): "bounds", which is allocated only when
                         :  * WORD_SIMILARITY_STRICT is set, is not pfree'd in this function --
                         :  * confirm it is meant to be reclaimed with the current memory context.
     626                 :  */
     627                 : static float4
     628 GIC       12126 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
     629                 :                      uint8 flags)
     630                 : {
     631                 :     bool       *found;
     632                 :     pos_trgm   *ptrg;
     633 ECB             :     trgm       *trg1;
     634                 :     trgm       *trg2;
     635                 :     int         len1,
     636                 :                 len2,
     637                 :                 len,
     638                 :                 i,
     639                 :                 j,
     640                 :                 ulen1;
     641                 :     int        *trg2indexes;
     642                 :     float4      result;
     643                 :     TrgmBound  *bounds;
     644                 : 
     645 GIC       12126 :     protect_out_of_mem(slen1 + slen2);
     646                 : 
     647                 :     /* Make positional trigrams */
     648           12126 :     trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
     649           12126 :     trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
     650 CBC       12126 :     if (flags & WORD_SIMILARITY_STRICT)
     651 GIC        6662 :         bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
     652                 :     else
     653 CBC        5464 :         bounds = NULL;
     654 ECB             : 
     655 CBC       12126 :     len1 = generate_trgm_only(trg1, str1, slen1, NULL);
     656           12126 :     len2 = generate_trgm_only(trg2, str2, slen2, bounds);
     657                 : 
     658           12126 :     ptrg = make_positional_trgm(trg1, len1, trg2, len2);
     659 GIC       12126 :     len = len1 + len2;
     660 CBC       12126 :     qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
     661 ECB             : 
     662 GIC       12126 :     pfree(trg1);
     663 CBC       12126 :     pfree(trg2);
     664 ECB             : 
     665                 :     /*
     666                 :      * Merge positional trigrams array: enumerate each trigram and find its
     667                 :      * presence in required word.
                         :      *
                         :      * Walking the sorted ptrg array, j is advanced each time the trigram
                         :      * differs from the previous entry, so j numbers the distinct trigrams.
                         :      * An entry with index >= 0 records j in trg2indexes[index]; any other
                         :      * entry sets found[j] (presumably those are the entries built from
                         :      * trg1 by make_positional_trgm -- confirm there).  ulen1 counts the
                         :      * distinct trigrams whose found[] flag is set.
                         :      *
                         :      * NOTE(review): when len is 0 the loop body never runs, yet found[0]
                         :      * is still read after the loop although found was allocated with
                         :      * zero length -- confirm that this is acceptable.
     668                 :      */
     669 GIC       12126 :     trg2indexes = (int *) palloc(sizeof(int) * len2);
     670           12126 :     found = (bool *) palloc0(sizeof(bool) * len);
     671                 : 
     672           12126 :     ulen1 = 0;
     673           12126 :     j = 0;
     674 CBC      300963 :     for (i = 0; i < len; i++)
     675 ECB             :     {
     676 GIC      288837 :         if (i > 0)
     677 ECB             :         {
     678 CBC      276711 :             int         cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
     679 ECB             : 
     680 GIC      276711 :             if (cmp != 0)
     681 ECB             :             {
     682 GIC      242510 :                 if (found[j])
     683 CBC      101138 :                     ulen1++;
     684 GIC      242510 :                 j++;
     685 ECB             :             }
     686                 :         }
     687                 : 
     688 CBC      288837 :         if (ptrg[i].index >= 0)
     689 ECB             :         {
     690 GIC      180099 :             trg2indexes[ptrg[i].index] = j;
     691                 :         }
     692                 :         else
     693 ECB             :         {
     694 GIC      108738 :             found[j] = true;
     695 ECB             :         }
     696                 :     }
     697 GIC       12126 :     if (found[j])               /* account for the last distinct trigram */
     698            7600 :         ulen1++;
     699 ECB             : 
     700                 :     /* Run iterative procedure to find maximum similarity with word */
     701 GIC       12126 :     result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
     702 ECB             :                                      flags, bounds);
     703                 : 
     704 GIC       12126 :     pfree(trg2indexes);
     705           12126 :     pfree(found);
     706 CBC       12126 :     pfree(ptrg);
     707                 : 
     708 GIC       12126 :     return result;
     709 ECB             : }
     710                 : 
     711                 : 
     712                 : /*
     713                 :  * Extract the next non-wildcard part of a search string, i.e. a word bounded
     714                 :  * by '_' or '%' meta-characters, non-word characters or string end.
     715                 :  *
     716                 :  * str: source string, of length lenstr bytes (need not be null-terminated)
     717                 :  * buf: where to return the substring (must be long enough)
     718                 :  * *bytelen: receives byte length of the found substring
     719                 :  * *charlen: receives character length of the found substring
     720                 :  *
     721                 :  * Returns pointer to end+1 of the found substring in the source string.
     722                 :  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
     723                 :  *
     724                 :  * If the found word is bounded by non-word characters or string boundaries
     725                 :  * then this function will include corresponding padding spaces into buf.
                         :  * At most two spaces are written before the word and two after it, and
                         :  * both *bytelen and *charlen include those padding spaces.
                         :  *
                         :  * NOTE(review): character lengths come from pg_mblen() and that many
                         :  * bytes are copied without a check against lenstr; this assumes str
                         :  * never ends in the middle of a multibyte character -- confirm.
     726                 :  */
     727                 : static const char *
     728 GIC         119 : get_wildcard_part(const char *str, int lenstr,
     729                 :                   char *buf, int *bytelen, int *charlen)
     730                 : {
     731             119 :     const char *beginword = str;
     732                 :     const char *endword;
     733 CBC         119 :     char       *s = buf;
     734 GIC         119 :     bool        in_leading_wildcard_meta = false;
     735             119 :     bool        in_trailing_wildcard_meta = false;
     736 CBC         119 :     bool        in_escape = false;
     737                 :     int         clen;
     738 ECB             : 
     739                 :     /*
     740                 :      * Find the first word character, remembering whether preceding character
     741                 :      * was wildcard meta-character.  Note that the in_escape state persists
     742                 :      * from this loop to the next one, since we may exit at a word character
     743                 :      * that is in_escape.
     744                 :      */
     745 GIC         241 :     while (beginword - str < lenstr)
     746                 :     {
     747             186 :         if (in_escape)
     748                 :         {
     749               3 :             if (ISWORDCHR(beginword))
     750 CBC           3 :                 break;
     751 UIC           0 :             in_escape = false;
     752 LBC           0 :             in_leading_wildcard_meta = false;
     753                 :         }
     754 ECB             :         else
     755                 :         {
     756 GBC         183 :             if (ISESCAPECHAR(beginword))
     757               3 :                 in_escape = true;
     758 GIC         180 :             else if (ISWILDCARDCHAR(beginword))
     759             104 :                 in_leading_wildcard_meta = true;
     760              76 :             else if (ISWORDCHR(beginword))
     761 CBC          61 :                 break;
     762 ECB             :             else
     763 CBC          15 :                 in_leading_wildcard_meta = false;
     764 ECB             :         }
     765 CBC         122 :         beginword += pg_mblen(beginword);
     766 ECB             :     }
     767                 : 
     768                 :     /*
     769                 :      * Handle string end: no word character was found, so there is no word
     770                 :      * to return.
     771 GIC         119 :     if (beginword - str >= lenstr)
     772              55 :         return NULL;
     773                 : 
     774                 :     /*
     775                 :      * Add left padding spaces if preceding character wasn't wildcard
     776 ECB             :      * meta-character.
     777                 :      */
     778 GIC          64 :     *charlen = 0;
     779              64 :     if (!in_leading_wildcard_meta)
     780                 :     {
     781                 :         if (LPADDING > 0)
     782                 :         {
     783 CBC          15 :             *s++ = ' ';
     784              15 :             (*charlen)++;
     785                 :             if (LPADDING > 1)
     786                 :             {
     787 GIC          15 :                 *s++ = ' ';
     788 CBC          15 :                 (*charlen)++;
     789 ECB             :             }
     790                 :         }
     791                 :     }
     792                 : 
     793                 :     /*
     794                 :      * Copy data into buf until wildcard meta-character, non-word character or
     795                 :      * string boundary.  Strip escapes during copy.
     796                 :      */
     797 GIC          64 :     endword = beginword;
     798             244 :     while (endword - str < lenstr)
     799                 :     {
     800             244 :         clen = pg_mblen(endword);
     801             244 :         if (in_escape)
     802 ECB             :         {
     803 CBC           3 :             if (ISWORDCHR(endword))
     804                 :             {
     805               3 :                 memcpy(s, endword, clen);
     806               3 :                 (*charlen)++;
     807 GIC           3 :                 s += clen;
     808 ECB             :             }
     809                 :             else
     810                 :             {
     811                 :                 /*
     812                 :                  * Back up endword to the escape character when stopping at an
     813                 :                  * escaped char, so that subsequent get_wildcard_part will
     814                 :                  * restart from the escape character.  We assume here that
     815                 :                  * escape chars are single-byte.
     816                 :                  */
     817 UIC           0 :                 endword--;
     818               0 :                 break;
     819                 :             }
     820 GIC           3 :             in_escape = false;
     821                 :         }
     822 EUB             :         else
     823                 :         {
     824 GIC         241 :             if (ISESCAPECHAR(endword))
     825 LBC           0 :                 in_escape = true;
     826 GIC         241 :             else if (ISWILDCARDCHAR(endword))
     827                 :             {
     828              55 :                 in_trailing_wildcard_meta = true;
     829 CBC          55 :                 break;
     830 EUB             :             }
     831 CBC         186 :             else if (ISWORDCHR(endword))
     832                 :             {
     833             177 :                 memcpy(s, endword, clen);
     834             177 :                 (*charlen)++;
     835 GIC         177 :                 s += clen;
     836 ECB             :             }
     837                 :             else
     838 CBC           9 :                 break;
     839 ECB             :         }
     840 CBC         180 :         endword += clen;
     841                 :     }
     842                 : 
     843 ECB             :     /*
     844                 :      * Add right padding spaces if next character isn't wildcard
     845                 :      * meta-character.
     846                 :      */
     847 GIC          64 :     if (!in_trailing_wildcard_meta)
     848                 :     {
     849                 :         if (RPADDING > 0)
     850                 :         {
     851               9 :             *s++ = ' ';
     852 CBC           9 :             (*charlen)++;
     853                 :             if (RPADDING > 1)
     854                 :             {
     855                 :                 *s++ = ' ';
     856 ECB             :                 (*charlen)++;
     857                 :             }
     858                 :         }
     859                 :     }
     860                 : 
     861 GIC          64 :     *bytelen = s - buf;         /* bytes written to buf, padding included */
     862              64 :     return endword;
     863                 : }
     864                 : 
     865                 : /*
     866 ECB             :  * Generates trigrams for wildcard search string.
     867                 :  *
     868                 :  * Returns array of trigrams that must occur in any string that matches the
     869                 :  * wildcard string.  For example, given pattern "a%bcd%" the trigrams
     870                 :  * " a", "bcd" would be extracted.
                         :  *
                         :  * str: wildcard pattern, of length slen bytes.
                         :  *
                         :  * The result is a palloc'd TRGM flagged ARRKEY.  When no trigrams are
                         :  * extracted its varsize is just TRGMHDRSIZE; otherwise the trigrams are
                         :  * sorted with comp_trgm and de-duplicated before the final size is set.
                         :  * The scratch buffer of slen + 4 bytes presumably leaves room for the
                         :  * padding spaces added by get_wildcard_part -- TODO confirm against
                         :  * LPADDING/RPADDING.
     871                 :  */
     872                 : TRGM *
     873 GIC          55 : generate_wildcard_trgm(const char *str, int slen)
     874                 : {
     875                 :     TRGM       *trg;
     876                 :     char       *buf,
     877                 :                *buf2;
     878 ECB             :     trgm       *tptr;
     879                 :     int         len,
     880                 :                 charlen,
     881                 :                 bytelen;
     882                 :     const char *eword;
     883                 : 
     884 GIC          55 :     protect_out_of_mem(slen);
     885                 : 
     886              55 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
     887              55 :     trg->flag = ARRKEY;
     888              55 :     SET_VARSIZE(trg, TRGMHDRSIZE);
     889 ECB             : 
     890 GIC          55 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
     891 LBC           0 :         return trg;
     892 ECB             : 
     893 CBC          55 :     tptr = GETARR(trg);
     894                 : 
     895 ECB             :     /* Allocate a buffer for blank-padded, but not yet case-folded, words */
     896 GBC          55 :     buf = palloc(sizeof(char) * (slen + 4));
     897                 : 
     898 ECB             :     /*
     899                 :      * Extract trigrams from each substring extracted by get_wildcard_part.
     900                 :      */
     901 CBC          55 :     eword = str;
     902 GIC         119 :     while ((eword = get_wildcard_part(eword, slen - (eword - str),
     903             119 :                                       buf, &bytelen, &charlen)) != NULL)
     904                 :     {
     905                 : #ifdef IGNORECASE
     906 CBC          64 :         buf2 = lowerstr_with_len(buf, bytelen);
     907              64 :         bytelen = strlen(buf2);
     908 ECB             : #else
     909                 :         buf2 = buf;
     910                 : #endif
     911                 : 
     912                 :         /*
     913                 :          * Append this word's trigrams at tptr; make_trigrams returns the
     914                 :          * advanced write position.
     915 GIC          64 :         tptr = make_trigrams(tptr, buf2, bytelen, charlen);
     916                 : 
     917                 : #ifdef IGNORECASE
     918              64 :         pfree(buf2);
     919                 : #endif
     920 ECB             :     }
     921                 : 
     922 GIC          55 :     pfree(buf);
     923 ECB             : 
     924 GIC          55 :     if ((len = tptr - GETARR(trg)) == 0)
     925              24 :         return trg;
     926                 : 
     927 ECB             :     /*
     928                 :      * Make trigrams unique.
     929                 :      */
     930 CBC          31 :     if (len > 1)
     931                 :     {
     932 GNC          17 :         qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
     933 GIC          17 :         len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
     934                 :     }
     935 ECB             : 
     936 GIC          31 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
     937 ECB             : 
     938 CBC          31 :     return trg;
     939                 : }
     940                 : /*
                         :  * Pack the three bytes of a trigram into one uint32: the first byte ends
                         :  * up in bits 16-23, the second in bits 8-15 and the third in bits 0-7.
                         :  * Each byte is read as unsigned char, so the top eight bits stay zero.
                         :  */
     941 ECB             : uint32
     942 GIC       34773 : trgm2int(trgm *ptr)
     943 ECB             : {
     944 GIC       34773 :     uint32      val = 0;
     945                 : 
     946           34773 :     val |= *(((unsigned char *) ptr));
     947 CBC       34773 :     val <<= 8;
     948 GIC       34773 :     val |= *(((unsigned char *) ptr) + 1);
     949 CBC       34773 :     val <<= 8;
     950 GIC       34773 :     val |= *(((unsigned char *) ptr) + 2);
     951 ECB             : 
     952 CBC       34773 :     return val;
     953 ECB             : }
     954                 : /*
                         :  * SQL-callable function: build the trigrams of the text argument with
                         :  * generate_trgm() and return them as an array of text values.
                         :  *
                         :  * Each trigram normally becomes a 3-byte text item.  When the database
                         :  * encoding is multibyte and ISPRINTABLETRGM() rejects the trigram, the
                         :  * item is instead the hex rendering "0x%06x" of trgm2int().
                         :  * The per-item buffers, the Datum array and the TRGM are pfree'd once
                         :  * the result array has been constructed.
                         :  */
     955                 : Datum
     956 GIC           7 : show_trgm(PG_FUNCTION_ARGS)
     957 ECB             : {
     958 GIC           7 :     text       *in = PG_GETARG_TEXT_PP(0);
     959                 :     TRGM       *trg;
     960                 :     Datum      *d;
     961 ECB             :     ArrayType  *a;
     962                 :     trgm       *ptr;
     963                 :     int         i;
     964                 : 
     965 GIC           7 :     trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
     966               7 :     d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
     967                 : 
     968              44 :     for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
     969                 :     {
     970 CBC          37 :         text       *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
     971 ECB             : 
     972 GIC          37 :         if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
     973 ECB             :         {
     974 UIC           0 :             snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));   /* bounded by the 12-byte minimum payload */
     975 LBC           0 :             SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
     976                 :         }
     977 ECB             :         else
     978                 :         {
     979 GBC          37 :             SET_VARSIZE(item, VARHDRSZ + 3);
     980              37 :             CPTRGM(VARDATA(item), ptr);
     981                 :         }
     982 GIC          37 :         d[i] = PointerGetDatum(item);
     983                 :     }
     984 ECB             : 
     985 GNC           7 :     a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
     986                 : 
     987 CBC          44 :     for (i = 0; i < ARRNELEM(trg); i++)
     988              37 :         pfree(DatumGetPointer(d[i]));
     989                 : 
     990               7 :     pfree(d);
     991               7 :     pfree(trg);
     992               7 :     PG_FREE_IF_COPY(in, 0);
     993                 : 
     994               7 :     PG_RETURN_POINTER(a);
     995                 : }
     996                 : /*
                         :  * Compute the similarity of two trigram arrays.
                         :  *
                         :  * Both arrays are walked in step, advancing whichever side compares
                         :  * lower under CMPTRGM and counting the equal pairs; this assumes both
                         :  * arrays are sorted in CMPTRGM order -- TODO confirm for all callers.
                         :  * The count and the two lengths are then combined by CALCSML.
                         :  *
                         :  * trg1, trg2: trigram arrays to compare.
                         :  * inexact: if true, count is passed to CALCSML in place of len2.
                         :  *
                         :  * Returns 0.0 if either array is empty.
                         :  */
     997                 : float4
     998           69791 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
     999                 : {
    1000                 :     trgm       *ptr1,
    1001                 :                *ptr2;
    1002           69791 :     int         count = 0;
    1003                 :     int         len1,
    1004                 :                 len2;
    1005                 : 
    1006           69791 :     ptr1 = GETARR(trg1);
    1007           69791 :     ptr2 = GETARR(trg2);
    1008                 : 
    1009           69791 :     len1 = ARRNELEM(trg1);
    1010           69791 :     len2 = ARRNELEM(trg2);
    1011                 : 
    1012                 :     /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
    1013           69791 :     if (len1 <= 0 || len2 <= 0)
    1014               1 :         return (float4) 0.0;
    1015                 : 
    1016          891582 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
    1017                 :     {
    1018          821792 :         int         res = CMPTRGM(ptr1, ptr2);
    1019                 : 
    1020          821792 :         if (res < 0)
    1021          189653 :             ptr1++;
    1022          632139 :         else if (res > 0)
    1023          220022 :             ptr2++;
    1024                 :         else
    1025                 :         {
    1026          412117 :             ptr1++;
    1027          412117 :             ptr2++;
    1028          412117 :             count++;
    1029                 :         }
    1030                 :     }
    1031                 : 
    1032                 :     /*
    1033                 :      * If inexact, pass count in place of len2: the actual length of the
    1034                 :      * second string is not known in an inexact search, and count is a
    1035                 :      * lower bound of len2.
    1036                 :      */
    1037           69790 :     return CALCSML(count, len1, inexact ? count : len2);
    1038                 : }
    1039                 : 
    1040                 : 
    1041                 : /*
    1042                 :  * Returns whether trg2 contains all trigrams in trg1.
    1043                 :  * This relies on the trigram arrays being sorted.
                         :  *
                         :  * Both arrays are walked in step.  As soon as the current trg1 trigram
                         :  * compares lower than the current trg2 trigram it cannot occur later in
                         :  * trg2, so the answer is false.  If trg2 runs out first, the remaining
                         :  * trg1 trigrams are unmatched and the answer is also false.
    1044                 :  */
    1045                 : bool
    1046             190 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
    1047                 : {
    1048                 :     trgm       *ptr1,
    1049                 :                *ptr2;
    1050                 :     int         len1,
    1051                 :                 len2;
    1052                 : 
    1053             190 :     ptr1 = GETARR(trg1);
    1054             190 :     ptr2 = GETARR(trg2);
    1055                 : 
    1056             190 :     len1 = ARRNELEM(trg1);
    1057             190 :     len2 = ARRNELEM(trg2);
    1058                 : 
    1059             622 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
    1060                 :     {
    1061             599 :         int         res = CMPTRGM(ptr1, ptr2);
    1062                 : 
    1063             599 :         if (res < 0)
    1064             167 :             return false;
    1065             432 :         else if (res > 0)
    1066             320 :             ptr2++;
    1067                 :         else
    1068                 :         {
    1069             112 :             ptr1++;
    1070             112 :             ptr2++;
    1071                 :         }
    1072                 :     }
    1073              23 :     if (ptr1 - GETARR(trg1) < len1)     /* trg2 exhausted, trg1 not */
    1074               4 :         return false;
    1075                 :     else
    1076              19 :         return true;
    1077                 : }
    1078                 : 
    1079                 : /*
    1080                 :  * Return a palloc'd boolean array showing, for each trigram in "query",
    1081                 :  * whether it is present in the trigram array "key".
    1082                 :  * This relies on the "key" array being sorted, but "query" need not be.
                         :  *
                         :  * The result has one entry per query trigram, in query order.  It is
                         :  * zero-initialized, so entries for trigrams that are not found stay
                         :  * false.
    1083                 :  */
    1084                 : bool *
    1085            2150 : trgm_presence_map(TRGM *query, TRGM *key)
    1086                 : {
    1087                 :     bool       *result;
    1088            2150 :     trgm       *ptrq = GETARR(query),
    1089            2150 :                *ptrk = GETARR(key);
    1090            2150 :     int         lenq = ARRNELEM(query),
    1091            2150 :                 lenk = ARRNELEM(key),
    1092                 :                 i;
    1093                 : 
    1094            2150 :     result = (bool *) palloc0(lenq * sizeof(bool));
    1095                 : 
    1096                 :     /* for each query trigram, do a binary search in the key array */
    1097          507560 :     for (i = 0; i < lenq; i++)
    1098                 :     {
    1099          505410 :         int         lo = 0;
    1100          505410 :         int         hi = lenk;      /* search range is [lo, hi) */
    1101                 : 
    1102         2373653 :         while (lo < hi)
    1103                 :         {
    1104         1876282 :             int         mid = (lo + hi) / 2;
    1105         1876282 :             int         res = CMPTRGM(ptrq, ptrk + mid);
    1106                 : 
    1107         1876282 :             if (res < 0)
    1108          784082 :                 hi = mid;
    1109         1092200 :             else if (res > 0)
    1110         1084161 :                 lo = mid + 1;
    1111                 :             else
    1112                 :             {
    1113            8039 :                 result[i] = true;
    1114            8039 :                 break;
    1115                 :             }
    1116                 :         }
    1117          505410 :         ptrq++;
    1118                 :     }
    1119                 : 
    1120            2150 :     return result;
    1121                 : }
    1122                 : /*
                         :  * SQL-callable function: similarity of two text values.
                         :  *
                         :  * Builds the trigram array of each argument with generate_trgm() and
                         :  * returns cnt_sml() of the two arrays (exact mode) as a float4.  Both
                         :  * trigram arrays are pfree'd before returning.
                         :  */
    1123                 : Datum
    1124           31452 : similarity(PG_FUNCTION_ARGS)
    1125                 : {
    1126           31452 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1127           31452 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1128                 :     TRGM       *trg1,
    1129                 :                *trg2;
    1130                 :     float4      res;
    1131                 : 
    1132           31452 :     trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
    1133           31452 :     trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
    1134                 : 
    1135           31452 :     res = cnt_sml(trg1, trg2, false);
    1136                 : 
    1137           31452 :     pfree(trg1);
    1138           31452 :     pfree(trg2);
    1139           31452 :     PG_FREE_IF_COPY(in1, 0);
    1140           31452 :     PG_FREE_IF_COPY(in2, 1);
    1141                 : 
    1142           31452 :     PG_RETURN_FLOAT4(res);
    1143                 : }
    1144                 : /*
                         :  * SQL-callable function: word similarity of two text values.
                         :  *
                         :  * The first argument is passed to calc_word_similarity() as the search
                         :  * pattern and the second as the text to search in, with no flags set.
                         :  */
    1145                 : Datum
    1146             902 : word_similarity(PG_FUNCTION_ARGS)
    1147                 : {
    1148             902 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1149             902 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1150                 :     float4      res;
    1151                 : 
    1152            1804 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1153            1804 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1154                 :                                0);
    1155                 : 
    1156             902 :     PG_FREE_IF_COPY(in1, 0);
    1157             902 :     PG_FREE_IF_COPY(in2, 1);
    1158             902 :     PG_RETURN_FLOAT4(res);
    1159                 : }
    1160                 : /*
                         :  * SQL-callable function: strict word similarity of two text values.
                         :  *
                         :  * Same call as word_similarity(), except that calc_word_similarity() is
                         :  * given the WORD_SIMILARITY_STRICT flag.
                         :  */
    1161                 : Datum
    1162             882 : strict_word_similarity(PG_FUNCTION_ARGS)
    1163                 : {
    1164             882 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1165             882 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1166                 :     float4      res;
    1167                 : 
    1168            1764 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1169            1764 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1170                 :                                WORD_SIMILARITY_STRICT);
    1171                 : 
    1172             882 :     PG_FREE_IF_COPY(in1, 0);
    1173             882 :     PG_FREE_IF_COPY(in2, 1);
    1174             882 :     PG_RETURN_FLOAT4(res);
    1175                 : }
    1176                 : /*
                         :  * SQL-callable function: distance between two text values, computed as
                         :  * 1.0 minus the result of similarity() on the same two arguments.
                         :  */
    1177                 : Datum
    1178            1004 : similarity_dist(PG_FUNCTION_ARGS)
    1179                 : {
    1180            1004 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
    1181                 :                                                          PG_GETARG_DATUM(0),
    1182                 :                                                          PG_GETARG_DATUM(1)));
    1183                 : 
    1184            1004 :     PG_RETURN_FLOAT4(1.0 - res);
    1185                 : }
    1186                 : 
    /*
     * similarity_op(arg0, arg1) returns bool
     *
     * Function-manager entry point. Invokes similarity() directly on the two
     * argument datums and reports whether the score is greater than or equal
     * to the similarity_threshold GUC variable.
     */
    1187                 : Datum
    1188            6000 : similarity_op(PG_FUNCTION_ARGS)
    1189                 : {
    1190            6000 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
    1191                 :                                                          PG_GETARG_DATUM(0),
    1192                 :                                                          PG_GETARG_DATUM(1)));
    1193                 : 
                               /* float4 score is compared against a double-typed threshold */
    1194            6000 :     PG_RETURN_BOOL(res >= similarity_threshold);
    1195                 : }
    1196                 : 
    /*
     * word_similarity_op(text, text) returns bool
     *
     * Function-manager entry point. Calls calc_word_similarity() on the two
     * text arguments, in their original order, with the
     * WORD_SIMILARITY_CHECK_ONLY flag, and reports whether the returned value
     * is greater than or equal to the word_similarity_threshold GUC variable.
     *
     * NOTE(review): what calc_word_similarity() returns when
     * WORD_SIMILARITY_CHECK_ONLY is set is not visible in this block --
     * confirm against its definition.
     */
    1197                 : Datum
    1198            1924 : word_similarity_op(PG_FUNCTION_ARGS)
    1199                 : {
    1200            1924 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1201            1924 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1202                 :     float4      res;
    1203                 : 
    1204            3848 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1205            3848 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1206                 :                                WORD_SIMILARITY_CHECK_ONLY);
    1207                 : 
                               /* release any copies that fetching the arguments may have created */
    1208            1924 :     PG_FREE_IF_COPY(in1, 0);
    1209            1924 :     PG_FREE_IF_COPY(in2, 1);
    1210            1924 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
    1211                 : }
    1212                 : 
    /*
     * word_similarity_commutator_op(text, text) returns bool
     *
     * Function-manager entry point. Same computation as word_similarity_op()
     * but with the operands swapped: the second argument is handed to
     * calc_word_similarity() first and the first argument second. The result
     * is compared against the word_similarity_threshold GUC variable.
     */
    1213                 : Datum
    1214            1924 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
    1215                 : {
    1216            1924 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1217            1924 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1218                 :     float4      res;
    1219                 : 
                               /* note the swapped order: in2 is passed before in1 */
    1220            3848 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1221            3848 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1222                 :                                WORD_SIMILARITY_CHECK_ONLY);
    1223                 : 
                               /* release any copies that fetching the arguments may have created */
    1224            1924 :     PG_FREE_IF_COPY(in1, 0);
    1225            1924 :     PG_FREE_IF_COPY(in2, 1);
    1226            1924 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
    1227                 : }
    1228                 : 
    /*
     * word_similarity_dist_op(text, text) returns float4
     *
     * Function-manager entry point. Calls calc_word_similarity() on the two
     * text arguments, in their original order, with no flags set (0), and
     * returns 1.0 minus the score.
     */
    1229                 : Datum
    1230 UBC           0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
    1231                 : {
    1232               0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1233               0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1234                 :     float4      res;
    1235                 : 
    1236               0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1237               0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1238                 :                                0);
    1239                 : 
                               /* release any copies that fetching the arguments may have created */
    1240               0 :     PG_FREE_IF_COPY(in1, 0);
    1241               0 :     PG_FREE_IF_COPY(in2, 1);
    1242               0 :     PG_RETURN_FLOAT4(1.0 - res);
    1243                 : }
    1244                 : 
    /*
     * word_similarity_dist_commutator_op(text, text) returns float4
     *
     * Function-manager entry point. Same computation as
     * word_similarity_dist_op() but with the operands swapped: the second
     * argument is handed to calc_word_similarity() first. No flags are set
     * (0), and the return value is 1.0 minus the score.
     */
    1245                 : Datum
    1246 CBC         714 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
    1247                 : {
    1248             714 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1249             714 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1250                 :     float4      res;
    1251                 : 
                               /* note the swapped order: in2 is passed before in1 */
    1252            1428 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1253            1428 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1254                 :                                0);
    1255                 : 
                               /* release any copies that fetching the arguments may have created */
    1256             714 :     PG_FREE_IF_COPY(in1, 0);
    1257             714 :     PG_FREE_IF_COPY(in2, 1);
    1258             714 :     PG_RETURN_FLOAT4(1.0 - res);
    1259                 : }
    1260                 : 
    /*
     * strict_word_similarity_op(text, text) returns bool
     *
     * Function-manager entry point. Calls calc_word_similarity() on the two
     * text arguments, in their original order, with both
     * WORD_SIMILARITY_CHECK_ONLY and WORD_SIMILARITY_STRICT set, and reports
     * whether the returned value is greater than or equal to the
     * strict_word_similarity_threshold GUC variable.
     *
     * NOTE(review): the effect of each flag inside calc_word_similarity() is
     * not visible in this block -- confirm against its definition.
     */
    1261                 : Datum
    1262            2530 : strict_word_similarity_op(PG_FUNCTION_ARGS)
    1263                 : {
    1264            2530 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1265            2530 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1266                 :     float4      res;
    1267                 : 
    1268            5060 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1269            5060 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1270                 :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
    1271                 : 
                               /* release any copies that fetching the arguments may have created */
    1272            2530 :     PG_FREE_IF_COPY(in1, 0);
    1273            2530 :     PG_FREE_IF_COPY(in2, 1);
    1274            2530 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
    1275                 : }
    1276                 : 
    /*
     * strict_word_similarity_commutator_op(text, text) returns bool
     *
     * Function-manager entry point. Same computation as
     * strict_word_similarity_op() but with the operands swapped: the second
     * argument is handed to calc_word_similarity() first. The result is
     * compared against the strict_word_similarity_threshold GUC variable.
     */
    1277                 : Datum
    1278            2530 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
    1279                 : {
    1280            2530 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1281            2530 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1282                 :     float4      res;
    1283                 : 
                               /* note the swapped order: in2 is passed before in1 */
    1284            5060 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1285            5060 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1286                 :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
    1287                 : 
                               /* release any copies that fetching the arguments may have created */
    1288            2530 :     PG_FREE_IF_COPY(in1, 0);
    1289            2530 :     PG_FREE_IF_COPY(in2, 1);
    1290            2530 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
    1291                 : }
    1292                 : 
    /*
     * strict_word_similarity_dist_op(text, text) returns float4
     *
     * Function-manager entry point. Calls calc_word_similarity() on the two
     * text arguments, in their original order, with the
     * WORD_SIMILARITY_STRICT flag, and returns 1.0 minus the score.
     */
    1293                 : Datum
    1294 UBC           0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
    1295                 : {
    1296               0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1297               0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1298                 :     float4      res;
    1299                 : 
    1300               0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1301               0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1302                 :                                WORD_SIMILARITY_STRICT);
    1303                 : 
                               /* release any copies that fetching the arguments may have created */
    1304               0 :     PG_FREE_IF_COPY(in1, 0);
    1305               0 :     PG_FREE_IF_COPY(in2, 1);
    1306               0 :     PG_RETURN_FLOAT4(1.0 - res);
    1307                 : }
    1308                 : 
    /*
     * strict_word_similarity_dist_commutator_op(text, text) returns float4
     *
     * Function-manager entry point. Same computation as
     * strict_word_similarity_dist_op() but with the operands swapped: the
     * second argument is handed to calc_word_similarity() first. The
     * WORD_SIMILARITY_STRICT flag is set, and the return value is 1.0 minus
     * the score.
     */
    1309                 : Datum
    1310 CBC         720 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
    1311                 : {
    1312             720 :     text       *in1 = PG_GETARG_TEXT_PP(0);
    1313             720 :     text       *in2 = PG_GETARG_TEXT_PP(1);
    1314                 :     float4      res;
    1315                 : 
                               /* note the swapped order: in2 is passed before in1 */
    1316            1440 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
    1317            1440 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
    1318                 :                                WORD_SIMILARITY_STRICT);
    1319                 : 
                               /* release any copies that fetching the arguments may have created */
    1320             720 :     PG_FREE_IF_COPY(in1, 0);
    1321             720 :     PG_FREE_IF_COPY(in2, 1);
    1322             720 :     PG_RETURN_FLOAT4(1.0 - res);
    1323                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a