LCOV - differential code coverage report
Current view: top level - contrib/pg_trgm - trgm_op.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 91.5 % 493 451 4 4 34 5 118 5 323 3 120 4
Current Date: 2023-04-08 17:13:01 Functions: 96.2 % 52 50 2 7 1 42 7
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (60,120] days: 100.0 % 2 2 2
Legend: Lines: hit not hit (180,240] days: 100.0 % 2 2 2
(240..) days: 91.4 % 489 447 4 4 34 5 118 1 323 3 120
Function coverage date bins:
(240..) days: 84.7 % 59 50 2 7 1 42 7

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*
                                  2                 :  * contrib/pg_trgm/trgm_op.c
                                  3                 :  */
                                  4                 : #include "postgres.h"
                                  5                 : 
                                  6                 : #include <ctype.h>
                                  7                 : 
                                  8                 : #include "catalog/pg_type.h"
                                  9                 : #include "lib/qunique.h"
                                 10                 : #include "miscadmin.h"
                                 11                 : #include "trgm.h"
                                 12                 : #include "tsearch/ts_locale.h"
                                 13                 : #include "utils/guc.h"
                                 14                 : #include "utils/lsyscache.h"
                                 15                 : #include "utils/memutils.h"
                                 16                 : #include "utils/pg_crc.h"
                                 17                 : 
 6158 tgl                        18 GIC           3 : PG_MODULE_MAGIC;
                                 19                 : 
 2580 teodor                     20 ECB             : /* GUC variables: similarity thresholds, exposed as the pg_trgm.*_threshold settings defined in _PG_init() */
                                 21                 : double      similarity_threshold = 0.3f;
                                 22                 : double      word_similarity_threshold = 0.6f;
                                 23                 : double      strict_word_similarity_threshold = 0.5f;
                                 24                 : 
 6887 teodor                     25 CBC           2 : PG_FUNCTION_INFO_V1(set_limit);
 4451 tgl                        26               2 : PG_FUNCTION_INFO_V1(show_limit);
                                 27               2 : PG_FUNCTION_INFO_V1(show_trgm);
                                 28               2 : PG_FUNCTION_INFO_V1(similarity);
 2580 teodor                     29               2 : PG_FUNCTION_INFO_V1(word_similarity);
 1845                            30               2 : PG_FUNCTION_INFO_V1(strict_word_similarity);
 4451 tgl                        31               2 : PG_FUNCTION_INFO_V1(similarity_dist);
                                 32               2 : PG_FUNCTION_INFO_V1(similarity_op);
 2580 teodor                     33               2 : PG_FUNCTION_INFO_V1(word_similarity_op);
                                 34               2 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
                                 35               1 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
                                 36               2 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
 1845                            37               2 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
                                 38               2 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
                                 39               1 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
                                 40               2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
                                 41                 : 
                                 42                 : /* Trigram with position */
                                 43                 : typedef struct
                                 44                 : {
                                 45                 :     trgm        trg;    /* the trigram itself */
                                 46                 :     int         index;  /* position; make_positional_trgm() stores -1 here for trigrams copied from trg1 */
                                 47                 : } pos_trgm;
                                 48                 : 
                                 49                 : /* Trigram bound type: bitwise OR of the TRGM_BOUND_* flags below */
                                 50                 : typedef uint8 TrgmBound;
                                 51                 : #define TRGM_BOUND_LEFT             0x01    /* trigram is left bound of word */
                                 52                 : #define TRGM_BOUND_RIGHT            0x02    /* trigram is right bound of word */
                                 53                 : 
                                 54                 : /* Word similarity flags */
                                 55                 : #define WORD_SIMILARITY_CHECK_ONLY  0x01    /* only check existence of similar
                                 56                 :                                              * search pattern in text */
                                 57                 : #define WORD_SIMILARITY_STRICT      0x02    /* force bounds of extent to match
                                 58                 :                                              * word bounds */
                                 59                 : 
                                 60                 : /*
                                 61                 :  * Module load callback: defines the three pg_trgm.*_threshold GUC variables, then calls MarkGUCPrefixReserved("pg_trgm").
                                 62                 :  */
                                 63                 : void
 2580                            64               3 : _PG_init(void)
                                 65                 : {
                                 66                 :     /* Define custom GUC variables. */
                                 67               3 :     DefineCustomRealVariable("pg_trgm.similarity_threshold",
                                 68                 :                              "Sets the threshold used by the % operator.",
                                 69                 :                              "Valid range is 0.0 .. 1.0.",
                                 70                 :                              &similarity_threshold,
                                 71                 :                              0.3f, /* same literal as similarity_threshold's initializer */
                                 72                 :                              0.0,
                                 73                 :                              1.0,
                                 74                 :                              PGC_USERSET,
                                 75                 :                              0,
                                 76                 :                              NULL,
                                 77                 :                              NULL,
                                 78                 :                              NULL);
                                 79               3 :     DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
                                 80                 :                              "Sets the threshold used by the <% operator.",
                                 81                 :                              "Valid range is 0.0 .. 1.0.",
                                 82                 :                              &word_similarity_threshold,
                                 83                 :                              0.6f, /* same literal as word_similarity_threshold's initializer */
                                 84                 :                              0.0,
                                 85                 :                              1.0,
                                 86                 :                              PGC_USERSET,
                                 87                 :                              0,
                                 88                 :                              NULL,
                                 89                 :                              NULL,
                                 90                 :                              NULL);
 1845                            91               3 :     DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
                                 92                 :                              "Sets the threshold used by the <<% operator.",
                                 93                 :                              "Valid range is 0.0 .. 1.0.",
                                 94                 :                              &strict_word_similarity_threshold,
                                 95                 :                              0.5f, /* same literal as strict_word_similarity_threshold's initializer */
                                 96                 :                              0.0,
                                 97                 :                              1.0,
                                 98                 :                              PGC_USERSET,
                                 99                 :                              0,
                                100                 :                              NULL,
                                101                 :                              NULL,
                                102                 :                              NULL);
                                103                 : 
  412 tgl                       104               3 :     MarkGUCPrefixReserved("pg_trgm");
 2580 teodor                    105               3 : }
                                106                 : 
                                107                 : /*
                                108                 :  * Deprecated function: sets "pg_trgm.similarity_threshold" from its float4 argument and returns similarity_threshold.
                                109                 :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
                                110                 :  */
                                111                 : Datum
 6797 bruce                     112               2 : set_limit(PG_FUNCTION_ARGS)
                                113                 : {
                                114               2 :     float4      nlimit = PG_GETARG_FLOAT4(0);
                                115                 :     char       *nlimit_str;
                                116                 :     Oid         func_out_oid;
                                117                 :     bool        is_varlena;
                                118                 : 
 2578 teodor                    119               2 :     getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena); /* look up the output function for FLOAT4OID */
                                120                 : 
                                121               2 :     nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit)); /* render nlimit as a string with that function */
                                122                 : 
                                123               2 :     SetConfigOption("pg_trgm.similarity_threshold", nlimit_str, /* the setting is assigned from the string form */
                                124                 :                     PGC_USERSET, PGC_S_SESSION);
                                125                 : 
 2580                           126               2 :     PG_RETURN_FLOAT4(similarity_threshold); /* value of the variable after the SetConfigOption() call */
                                127                 : }
                                128                 : 
                                129                 : 
                                130                 : /*
                                131                 :  * Get similarity threshold for given index scan strategy number; any strategy not listed below is reported via elog(ERROR).
                                132                 :  */
                                133                 : double
 1845                           134           44078 : index_strategy_get_limit(StrategyNumber strategy)
                                135                 : {
                                136           44078 :     switch (strategy)
                                137                 :     {
                                138           33278 :         case SimilarityStrategyNumber:
                                139           33278 :             return similarity_threshold;
                                140            4822 :         case WordSimilarityStrategyNumber:
                                141            4822 :             return word_similarity_threshold;
                                142            5978 :         case StrictWordSimilarityStrategyNumber:
                                143            5978 :             return strict_word_similarity_threshold;
 1845 teodor                    144 UBC           0 :         default:
                                145               0 :             elog(ERROR, "unrecognized strategy number: %d", strategy); /* none of the three known strategies matched */
                                146                 :             break;
                                147                 :     }
                                148                 : 
                                149                 :     return 0.0;                 /* keep compiler quiet */
                                150                 : }
                                151                 : 
                                152                 : /*
                                153                 :  * Deprecated function: returns the current value of similarity_threshold.
                                154                 :  * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
                                155                 :  */
                                156                 : Datum
 6797 bruce                     157 CBC       20000 : show_limit(PG_FUNCTION_ARGS)
                                158                 : {
 2580 teodor                    159           20000 :     PG_RETURN_FLOAT4(similarity_threshold); /* no arguments are read */
                                160                 : }
                                161                 : 
                                162                 : static int /* comparator handed to qsort() and qunique() in generate_trgm() */
 6797 bruce                     163         3132620 : comp_trgm(const void *a, const void *b)
                                164                 : {
                                165         3132620 :     return CMPTRGM(a, b); /* the ordering is whatever the CMPTRGM macro defines */
                                166                 : }
                                167                 : 
                                168                 : /*
                                169                 :  * Finds the first word within the first lenstr bytes of str and returns a pointer to it, or NULL if there is none.
                                170                 :  * On success *endword points to the character after the word and *charlen holds the word length (one count per pg_mblen() step).
                                171                 :  */
                                172                 : static char *
 5050                           173          239413 : find_word(char *str, int lenstr, char **endword, int *charlen)
                                174                 : {
                                175          239413 :     char       *beginword = str;
                                176                 : 
 3652 tgl                       177          253079 :     while (beginword - str < lenstr && !ISWORDCHR(beginword)) /* skip leading non-word characters */
 5261 teodor                    178           13666 :         beginword += pg_mblen(beginword);
                                179                 : 
                                180          239413 :     if (beginword - str >= lenstr) /* ran off the end: no word found; *endword and *charlen are left untouched */
                                181          113080 :         return NULL;
                                182                 : 
                                183          126333 :     *endword = beginword;
                                184          126333 :     *charlen = 0;
 3652 tgl                       185         1087687 :     while (*endword - str < lenstr && ISWORDCHR(*endword)) /* advance to the end of the word, counting characters */
                                186                 :     {
 5261 teodor                    187          961354 :         *endword += pg_mblen(*endword);
                                188          961354 :         (*charlen)++;
                                189                 :     }
                                190                 : 
                                191          126333 :     return beginword;
                                192                 : }
                                193                 : 
                                194                 : /*
                                195                 :  * Reduce a trigram (three possibly multi-byte characters) to a trgm,
                                196                 :  * which is always exactly three bytes.  If we have three single-byte
                                197                 :  * characters, we just use them as-is; otherwise we form a hash value.
                                198                 :  */
                                199                 : void
 3654 tgl                       200            1459 : compact_trigram(trgm *tptr, char *str, int bytelen)
                                201                 : {
 5050 bruce                     202            1459 :     if (bytelen == 3)
                                203                 :     {
                                204            1459 :         CPTRGM(tptr, str);
                                205                 :     }
                                206                 :     else
                                207                 :     {
                                208                 :         pg_crc32    crc;
                                209                 : 
 3078 heikki.linnakangas        210 UBC           0 :         INIT_LEGACY_CRC32(crc);
                                211               0 :         COMP_LEGACY_CRC32(crc, str, bytelen); /* CRC over all bytelen bytes of the trigram */
                                212               0 :         FIN_LEGACY_CRC32(crc);
                                213                 : 
                                214                 :         /*
                                215                 :          * Use only 3 bytes of the CRC as the trigram; hopefully that is a good enough hash.  NOTE(review): the original wording says the 3 "upper" bytes -- confirm against CPTRGM's definition.
                                216                 :          */
 5261 teodor                    217               0 :         CPTRGM(tptr, &crc);
                                218                 :     }
 5261 teodor                    219 CBC        1459 : }
                                220                 : 
                                221                 : /*
                                222                 :  * Adds trigrams from the (already padded) word str, of bytelen bytes and charlen characters, at tptr; returns the pointer just past the last trigram written.
                                223                 :  */
                                224                 : static trgm *
 5050 bruce                     225          126397 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
                                226                 : {
                                227          126397 :     char       *ptr = str;
                                228                 : 
                                229          126397 :     if (charlen < 3) /* fewer than three characters: nothing is written */
 5261 teodor                    230              27 :         return tptr;
                                231                 : 
 3654 tgl                       232          126370 :     if (bytelen > charlen)
                                233                 :     {
                                234                 :         /* Find multibyte character boundaries and apply compact_trigram */
 5050 bruce                     235 UBC           0 :         int         lenfirst = pg_mblen(str),
                                236               0 :                     lenmiddle = pg_mblen(str + lenfirst),
                                237               0 :                     lenlast = pg_mblen(str + lenfirst + lenmiddle);
                                238                 : 
                                239               0 :         while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen) /* while the three-character window still fits in bytelen */
                                240                 :         {
 3654 tgl                       241               0 :             compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
                                242                 : 
 5261 teodor                    243               0 :             ptr += lenfirst; /* slide the window forward by one character */
                                244               0 :             tptr++;
                                245                 : 
 5050 bruce                     246               0 :             lenfirst = lenmiddle;
                                247               0 :             lenmiddle = lenlast;
                                248               0 :             lenlast = pg_mblen(ptr + lenfirst + lenmiddle); /* length of the character entering the window */
                                249                 :         }
                                250                 :     }
                                251                 :     else
                                252                 :     {
                                253                 :         /* Fast path when there are no multibyte characters */
 5050 bruce                     254 CBC      126370 :         Assert(bytelen == charlen);
                                255                 : 
 5261 teodor                    256         1214148 :         while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
                                257                 :         {
                                258         1087778 :             CPTRGM(tptr, ptr);
                                259         1087778 :             ptr++;
                                260         1087778 :             tptr++;
                                261                 :         }
                                262                 :     }
                                263                 : 
                                264          126370 :     return tptr;
                                265                 : }
                                266                 : 
                                267                 : /*
                                268                 :  * Make array of trigrams without sorting and removing duplicate items.
                                269                 :  *
                                270                 :  * trg: where to return the array of trigrams.
                                271                 :  * str: source string, of length slen bytes.
                                272                 :  * bounds: where to return bounds of trigrams (if needed); may be NULL. Flags are OR-ed in, so the caller presumably zeroes the array first -- TODO confirm.
                                273                 :  *
                                274                 :  * Returns length of the generated array.
                                275                 :  */
                                276                 : static int
 1845                           277          113081 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
                                278                 : {
                                279                 :     trgm       *tptr;
                                280                 :     char       *buf;
                                281                 :     int         charlen,
                                282                 :                 bytelen;
                                283                 :     char       *bword,
                                284                 :                *eword;
                                285                 : 
 6797 bruce                     286          113081 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
 2580 teodor                    287               1 :         return 0;
                                288                 : 
                                289          113080 :     tptr = trg;
                                290                 : 
                                291                 :     /* Allocate a buffer for case-folded, blank-padded words */
 3373 tgl                       292          113080 :     buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
                                293                 : 
                                294                 :     if (LPADDING > 0)
                                295                 :     {
 6887 teodor                    296          113080 :         *buf = ' ';
                                297                 :         if (LPADDING > 1)
 6797 bruce                     298          113080 :             *(buf + 1) = ' ';
                                299                 :     }
                                300                 : 
 5261 teodor                    301          113080 :     eword = str; /* find_word() below always resumes scanning from eword */
 5050 bruce                     302          239413 :     while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
                                303                 :     {
                                304                 : #ifdef IGNORECASE
 5261 teodor                    305          126333 :         bword = lowerstr_with_len(bword, eword - bword); /* bword now refers to the returned string, pfree'd below */
                                306          126333 :         bytelen = strlen(bword);
                                307                 : #else
                                308                 :         bytelen = eword - bword;
                                309                 : #endif
                                310                 : 
                                311          126333 :         memcpy(buf + LPADDING, bword, bytelen); /* place the word right after the left padding */
                                312                 : 
                                313                 : #ifdef IGNORECASE
                                314          126333 :         pfree(bword);
                                315                 : #endif
                                316                 : 
 5050 bruce                     317          126333 :         buf[LPADDING + bytelen] = ' '; /* two blanks are always written after the word */
                                318          126333 :         buf[LPADDING + bytelen + 1] = ' ';
                                319                 : 
                                320                 :         /* Calculate trigrams marking their bounds if needed */
 1845 teodor                    321          126333 :         if (bounds)
                                322           12400 :             bounds[tptr - trg] |= TRGM_BOUND_LEFT; /* slot of the first trigram about to be generated for this word */
 5050 bruce                     323          126333 :         tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
                                324                 :                              charlen + LPADDING + RPADDING);
 1845 teodor                    325          126333 :         if (bounds)
                                326           12400 :             bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT; /* slot just before the new end of the array */
                                327                 :     }
                                328                 : 
 6887                           329          113080 :     pfree(buf);
                                330                 : 
 2580                           331          113080 :     return tptr - trg;
                                332                 : }
                                333                 : 
                                334                 : /*
                                335                 :  * Guard against possible overflow in the palloc requests below.  (We
                                336                 :  * don't worry about the additive constants, since palloc can detect
                                337                 :  * requests that are a little above MaxAllocSize --- we just need to
                                338                 :  * prevent integer overflow in the multiplications.)
                                339                 :  */
                                340                 : static void
                                341          101010 : protect_out_of_mem(int slen)
                                342                 : {
                                343          101010 :     if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) || /* covers sizeof(trgm) * (slen / 2 + 1) * 3 in generate_trgm() */
                                344          101010 :         (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length())) /* covers slen * pg_database_encoding_max_length() in generate_trgm_only() */
 2580 teodor                    345 UBC           0 :         ereport(ERROR,
                                346                 :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                347                 :                  errmsg("out of memory")));
 2580 teodor                    348 CBC      101010 : }
                                349                 : 
                                350                 : /*
                                351                 :  * Make array of trigrams with sorting and removing duplicate items.
                                352                 :  *
                                353                 :  * str: source string, of length slen bytes.
                                354                 :  *
                                355                 :  * Returns the sorted array of unique trigrams, as a palloc'd TRGM with flag ARRKEY.
                                356                 :  */
                                357                 : TRGM *
                                358           88829 : generate_trgm(char *str, int slen)
                                359                 : {
                                360                 :     TRGM       *trg;
                                361                 :     int         len;
                                362                 : 
                                363           88829 :     protect_out_of_mem(slen); /* reports an error if the size computations below could overflow */
                                364                 : 
 2118 tgl                       365           88829 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
 2580 teodor                    366           88829 :     trg->flag = ARRKEY;
                                367                 : 
 1845                           368           88829 :     len = generate_trgm_only(GETARR(trg), str, slen, NULL); /* NULL: trigram bounds are not needed here */
 2580                           369           88829 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); /* size before de-duplication; final when len == 0 */
                                370                 : 
                                371           88829 :     if (len == 0)
 6887                           372               4 :         return trg;
                                373                 : 
                                374                 :     /*
                                375                 :      * Make trigrams unique.
                                376                 :      */
 3373 tgl                       377           88825 :     if (len > 1) /* a single trigram is already sorted and unique */
                                378                 :     {
   61 peter                     379 GNC       88825 :         qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
 1249 tmunro                    380 CBC       88825 :         len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
                                381                 :     }
                                382                 : 
 5884 tgl                       383           88825 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len)); /* len may have been changed by qunique() */
                                384                 : 
 6887 teodor                    385           88825 :     return trg;
                                386                 : }
                                387                 : 
                                388                 : /*
                                389                 :  * Make array of positional trigrams from two trigram arrays trg1 and trg2.
                                390                 :  *
                                391                 :  * trg1: trigram array of search pattern, of length len1. trg1 is the required
                                392                 :  *       word, whose positions don't matter and are replaced with -1.
                                393                 :  * trg2: trigram array of text, of length len2. trg2 is the haystack in which
                                394                 :  *       we search, so its positions have to be stored.
                                395                 :  *
                                396                 :  * Returns concatenated trigram array.
                                397                 :  */
                                398                 : static pos_trgm *
 2580                           399           12126 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
                                400                 : {
                                401                 :     pos_trgm   *result;
                                402                 :     int         i,
 2495 rhaas                     403           12126 :                 len = len1 + len2;
                                404                 : 
 2580 teodor                    405           12126 :     result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
                                406                 : 
                                407          120864 :     for (i = 0; i < len1; i++)
                                408                 :     {
                                409          108738 :         memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
                                410          108738 :         result[i].index = -1;
                                411                 :     }
                                412                 : 
                                413          192225 :     for (i = 0; i < len2; i++)
                                414                 :     {
                                415          180099 :         memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
                                416          180099 :         result[i + len1].index = i;
                                417                 :     }
                                418                 : 
                                419           12126 :     return result;
                                420                 : }
                                421                 : 
                                422                 : /*
                                423                 :  * Compare position trigrams: compare trigrams first and position second.
                                424                 :  */
                                425                 : static int
                                426         1307800 : comp_ptrgm(const void *v1, const void *v2)
                                427                 : {
 2495 rhaas                     428         1307800 :     const pos_trgm *p1 = (const pos_trgm *) v1;
                                429         1307800 :     const pos_trgm *p2 = (const pos_trgm *) v2;
                                430                 :     int         cmp;
                                431                 : 
 2580 teodor                    432         1307800 :     cmp = CMPTRGM(p1->trg, p2->trg);
                                433         1307800 :     if (cmp != 0)
                                434         1268095 :         return cmp;
                                435                 : 
                                436           39705 :     if (p1->index < p2->index)
                                437           21365 :         return -1;
                                438           18340 :     else if (p1->index == p2->index)
 2580 teodor                    439 UBC           0 :         return 0;
                                440                 :     else
 2580 teodor                    441 CBC       18340 :         return 1;
                                442                 : }
                                443                 : 
                                444                 : /*
                                445                 :  * Iterative search function which calculates maximum similarity with word in
                                446                 :  * the string. Maximum similarity is calculated only if the flag
                                447                 :  * WORD_SIMILARITY_CHECK_ONLY isn't set.
                                448                 :  *
                                449                 :  * trg2indexes: array which stores indexes of the array "found".
                                450                 :  * found: array which stores true or false values.
                                451                 :  * ulen1: count of unique trigrams of array "trg1".
                                452                 :  * len2: length of array "trg2" and array "trg2indexes".
                                453                 :  * len: length of the array "found".
                                454                 :  * flags: set of boolean flags parameterizing similarity calculation.
                                455                 :  * bounds: whether each trigram is left/right bound of word.
                                456                 :  *
                                457                 :  * Returns word similarity.
                                458                 :  */
                                459                 : static float4
 2580 teodor                    460 GIC       12126 : iterate_word_similarity(int *trg2indexes,
 2580 teodor                    461 ECB             :                         bool *found,
                                462                 :                         int ulen1,
                                463                 :                         int len2,
                                464                 :                         int len,
                                465                 :                         uint8 flags,
                                466                 :                         TrgmBound *bounds)
                                467                 : {
                                468                 :     int        *lastpos,
                                469                 :                 i,
 2580 teodor                    470 GIC       12126 :                 ulen2 = 0,
 2580 teodor                    471 CBC       12126 :                 count = 0,
                                472           12126 :                 upper = -1,
 1845 teodor                    473 ECB             :                 lower;
                                474                 :     float4      smlr_cur,
 2580 teodor                    475 GIC       12126 :                 smlr_max = 0.0f;
 1845 teodor                    476 ECB             :     double      threshold;
                                477                 : 
 1845 teodor                    478 GIC       12126 :     Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
 1845 teodor                    479 ECB             : 
                                480                 :     /* Select appropriate threshold */
 1845 teodor                    481 GIC       24252 :     threshold = (flags & WORD_SIMILARITY_STRICT) ?
 1809 tgl                       482 CBC       12126 :         strict_word_similarity_threshold :
 1809 tgl                       483 ECB             :         word_similarity_threshold;
                                484                 : 
                                485                 :     /*
                                486                 :      * Consider first trigram as initial lower bound for strict word
                                487                 :      * similarity, or initialize it later with first trigram present for plain
                                488                 :      * word similarity.
                                489                 :      */
 1845 teodor                    490 GIC       12126 :     lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
 2580 teodor                    491 ECB             : 
                                492                 :     /* Memorise last position of each trigram */
 2580 teodor                    493 GIC       12126 :     lastpos = (int *) palloc(sizeof(int) * len);
 2580 teodor                    494 CBC       12126 :     memset(lastpos, -1, sizeof(int) * len);
 2580 teodor                    495 ECB             : 
 2580 teodor                    496 GIC      183655 :     for (i = 0; i < len2; i++)
 2580 teodor                    497 ECB             :     {
                                498                 :         int         trgindex;
                                499                 : 
  216 dgustafsson               500 GNC      173313 :         CHECK_FOR_INTERRUPTS();
                                501                 : 
                                502                 :         /* Get index of next trigram */
                                503          173313 :         trgindex = trg2indexes[i];
                                504                 : 
 2580 teodor                    505 ECB             :         /* Update last position of this trigram */
 2580 teodor                    506 GIC      173313 :         if (lower >= 0 || found[trgindex])
                                507                 :         {
 2580 teodor                    508 CBC      135805 :             if (lastpos[trgindex] < 0)
                                509                 :             {
 2580 teodor                    510 GIC      133952 :                 ulen2++;
 2580 teodor                    511 CBC      133952 :                 if (found[trgindex])
 2580 teodor                    512 GIC       30756 :                     count++;
 2580 teodor                    513 ECB             :             }
 2580 teodor                    514 GIC      135805 :             lastpos[trgindex] = i;
 2580 teodor                    515 ECB             :         }
                                516                 : 
 1845                           517                 :         /*
                                518                 :          * Adjust upper bound if trigram is upper bound of word for strict
                                519                 :          * word similarity, or if trigram is present in required substring for
                                520                 :          * plain word similarity
                                521                 :          */
 1845 teodor                    522 GIC      250355 :         if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
 1809 tgl                       523           77042 :             : found[trgindex])
                                524                 :         {
                                525                 :             int         prev_lower,
                                526                 :                         tmp_ulen2,
 2495 rhaas                     527 ECB             :                         tmp_lower,
                                528                 :                         tmp_count;
                                529                 : 
 2580 teodor                    530 GIC       25638 :             upper = i;
                                531           25638 :             if (lower == -1)
                                532                 :             {
                                533            4695 :                 lower = i;
                                534            4695 :                 ulen2 = 1;
 2580 teodor                    535 ECB             :             }
                                536                 : 
 2580 teodor                    537 GIC       25638 :             smlr_cur = CALCSML(count, ulen1, ulen2);
 2580 teodor                    538 ECB             : 
 1845                           539                 :             /* Also try to adjust lower bound for greater similarity */
 2580 teodor                    540 GIC       25638 :             tmp_count = count;
                                541           25638 :             tmp_ulen2 = ulen2;
 2580 teodor                    542 CBC       25638 :             prev_lower = lower;
 2580 teodor                    543 GIC      208652 :             for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
                                544                 :             {
 1845 teodor                    545 ECB             :                 float       smlr_tmp;
 2495 rhaas                     546                 :                 int         tmp_trgindex;
 2580 teodor                    547                 : 
                                548                 :                 /*
                                549                 :                  * Adjust lower bound only if trigram is lower bound of word
                                550                 :                  * for strict word similarity, or consider every trigram as
                                551                 :                  * lower bound for plain word similarity.
                                552                 :                  */
 1845 teodor                    553 GIC      184798 :                 if (!(flags & WORD_SIMILARITY_STRICT)
                                554          145233 :                     || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
                                555                 :                 {
                                556           59704 :                     smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
                                557           59704 :                     if (smlr_tmp > smlr_cur)
 1845 teodor                    558 ECB             :                     {
 1845 teodor                    559 CBC        3511 :                         smlr_cur = smlr_tmp;
 1845 teodor                    560 GIC        3511 :                         ulen2 = tmp_ulen2;
 1845 teodor                    561 CBC        3511 :                         lower = tmp_lower;
                                562            3511 :                         count = tmp_count;
                                563                 :                     }
 1845 teodor                    564 ECB             : 
                                565                 :                     /*
                                566                 :                      * If we only check that word similarity is greater than
                                567                 :                      * threshold we do not need to calculate a maximum
                                568                 :                      * similarity.
                                569                 :                      */
 1845 teodor                    570 GIC       59704 :                     if ((flags & WORD_SIMILARITY_CHECK_ONLY)
                                571           37114 :                         && smlr_cur >= threshold)
                                572            1784 :                         break;
                                573                 :                 }
                                574                 : 
 2580 teodor                    575 CBC      183014 :                 tmp_trgindex = trg2indexes[tmp_lower];
                                576          183014 :                 if (lastpos[tmp_trgindex] == tmp_lower)
 2580 teodor                    577 ECB             :                 {
 2580 teodor                    578 GIC      180753 :                     tmp_ulen2--;
                                579          180753 :                     if (found[tmp_trgindex])
 2580 teodor                    580 CBC       46591 :                         tmp_count--;
 2580 teodor                    581 ECB             :                 }
                                582                 :             }
                                583                 : 
 2580 teodor                    584 CBC       25638 :             smlr_max = Max(smlr_max, smlr_cur);
 2495 rhaas                     585 ECB             : 
                                586                 :             /*
                                587                 :              * if we only check that word similarity is greater than threshold
                                588                 :              * we do not need to calculate a maximum similarity.
 2580 teodor                    589                 :              */
 1845 teodor                    590 GIC       25638 :             if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
 2580                           591            1784 :                 break;
                                592                 : 
                                593           40602 :             for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
                                594                 :             {
 2495 rhaas                     595 ECB             :                 int         tmp_trgindex;
                                596                 : 
 2580 teodor                    597 GIC       16748 :                 tmp_trgindex = trg2indexes[tmp_lower];
 2580 teodor                    598 CBC       16748 :                 if (lastpos[tmp_trgindex] == tmp_lower)
 2580 teodor                    599 GIC       16000 :                     lastpos[tmp_trgindex] = -1;
                                600                 :             }
                                601                 :         }
 2580 teodor                    602 ECB             :     }
                                603                 : 
 2580 teodor                    604 CBC       12126 :     pfree(lastpos);
                                605                 : 
 2580 teodor                    606 GIC       12126 :     return smlr_max;
                                607                 : }
                                608                 : 
 2580 teodor                    609 ECB             : /*
                                610                 :  * Calculate word similarity.
                                611                 :  * This function prepares two arrays: "trg2indexes" and "found". These arrays
                                612                 :  * are then used to calculate word similarity using iterate_word_similarity().
                                613                 :  *
                                614                 :  * "trg2indexes" is array which stores indexes of the array "found".
                                615                 :  * In other words:
                                616                 :  * trg2indexes[j] = i;
                                617                 :  * found[i] = true (or false);
                                618                 :  * If found[i] == true then there is trigram trg2[j] in array "trg1".
                                619                 :  * If found[i] == false then there is no trigram trg2[j] in array "trg1".
                                620                 :  *
                                621                 :  * str1: search pattern string, of length slen1 bytes.
                                622                 :  * str2: text in which we are looking for a word, of length slen2 bytes.
                                623                 :  * flags: set of boolean flags parameterizing similarity calculation.
                                624                 :  *
                                625                 :  * Returns word similarity.
                                626                 :  */
                                627                 : static float4
 2580 teodor                    628 GIC       12126 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
                                629                 :                      uint8 flags)
                                630                 : {
                                631                 :     bool       *found;
                                632                 :     pos_trgm   *ptrg;
 2580 teodor                    633 ECB             :     trgm       *trg1;
                                634                 :     trgm       *trg2;
                                635                 :     int         len1,
                                636                 :                 len2,
                                637                 :                 len,
                                638                 :                 i,
                                639                 :                 j,
                                640                 :                 ulen1;
                                641                 :     int        *trg2indexes;
                                642                 :     float4      result;
                                643                 :     TrgmBound  *bounds;
                                644                 : 
 2580 teodor                    645 GIC       12126 :     protect_out_of_mem(slen1 + slen2);
                                646                 : 
                                647                 :     /* Make positional trigrams */
 2118 tgl                       648           12126 :     trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
                                649           12126 :     trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
 1845 teodor                    650 CBC       12126 :     if (flags & WORD_SIMILARITY_STRICT)
 1845 teodor                    651 GIC        6662 :         bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
                                652                 :     else
 1845 teodor                    653 CBC        5464 :         bounds = NULL;
 2580 teodor                    654 ECB             : 
 1845 teodor                    655 CBC       12126 :     len1 = generate_trgm_only(trg1, str1, slen1, NULL);
                                656           12126 :     len2 = generate_trgm_only(trg2, str2, slen2, bounds);
                                657                 : 
 2580                           658           12126 :     ptrg = make_positional_trgm(trg1, len1, trg2, len2);
 2580 teodor                    659 GIC       12126 :     len = len1 + len2;
 2580 teodor                    660 CBC       12126 :     qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
 2580 teodor                    661 ECB             : 
 2580 teodor                    662 GIC       12126 :     pfree(trg1);
 2580 teodor                    663 CBC       12126 :     pfree(trg2);
 2580 teodor                    664 ECB             : 
                                665                 :     /*
                                666                 :      * Merge positional trigrams array: enumerate each trigram and find its
                                667                 :      * presence in required word.
                                668                 :      */
 2580 teodor                    669 GIC       12126 :     trg2indexes = (int *) palloc(sizeof(int) * len2);
                                670           12126 :     found = (bool *) palloc0(sizeof(bool) * len);
                                671                 : 
                                672           12126 :     ulen1 = 0;
                                673           12126 :     j = 0;
 2580 teodor                    674 CBC      300963 :     for (i = 0; i < len; i++)
 2580 teodor                    675 ECB             :     {
 2580 teodor                    676 GIC      288837 :         if (i > 0)
 2580 teodor                    677 ECB             :         {
 2495 rhaas                     678 CBC      276711 :             int         cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
 2495 rhaas                     679 ECB             : 
 2580 teodor                    680 GIC      276711 :             if (cmp != 0)
 2580 teodor                    681 ECB             :             {
 2580 teodor                    682 GIC      242510 :                 if (found[j])
 2580 teodor                    683 CBC      101138 :                     ulen1++;
 2580 teodor                    684 GIC      242510 :                 j++;
 2580 teodor                    685 ECB             :             }
                                686                 :         }
                                687                 : 
 2580 teodor                    688 CBC      288837 :         if (ptrg[i].index >= 0)
 2580 teodor                    689 ECB             :         {
 2580 teodor                    690 GIC      180099 :             trg2indexes[ptrg[i].index] = j;
                                691                 :         }
                                692                 :         else
 2580 teodor                    693 ECB             :         {
 2580 teodor                    694 GIC      108738 :             found[j] = true;
 2580 teodor                    695 ECB             :         }
                                696                 :     }
 2580 teodor                    697 GIC       12126 :     if (found[j])
                                698            7600 :         ulen1++;
 2580 teodor                    699 ECB             : 
                                700                 :     /* Run iterative procedure to find maximum similarity with word */
 2580 teodor                    701 GIC       12126 :     result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
 1845 teodor                    702 ECB             :                                      flags, bounds);
 2580                           703                 : 
 2580 teodor                    704 GIC       12126 :     pfree(trg2indexes);
                                705           12126 :     pfree(found);
 2580 teodor                    706 CBC       12126 :     pfree(ptrg);
                                707                 : 
 2580 teodor                    708 GIC       12126 :     return result;
 2580 teodor                    709 ECB             : }
                                710                 : 
                                711                 : 
                                712                 : /*
 2578 rhaas                     713                 :  * Extract the next non-wildcard part of a search string, i.e. a word bounded
                                714                 :  * by '_' or '%' meta-characters, non-word characters or string end.
                                715                 :  *
                                716                 :  * str: source string, of length lenstr bytes (need not be null-terminated)
                                717                 :  * buf: where to return the substring (must be long enough)
                                718                 :  * *bytelen: receives byte length of the found substring
                                719                 :  * *charlen: receives character length of the found substring
                                720                 :  *
                                721                 :  * Returns pointer to end+1 of the found substring in the source string.
                                722                 :  * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
                                723                 :  *
                                724                 :  * If the found word is bounded by non-word characters or string boundaries
                                725                 :  * then this function will include corresponding padding spaces into buf.
                                726                 :  */
                                727                 : static const char *
 4451 tgl                       728 GIC         119 : get_wildcard_part(const char *str, int lenstr,
                                729                 :                   char *buf, int *bytelen, int *charlen)
                                730                 : {
                                731             119 :     const char *beginword = str;
                                732                 :     const char *endword;
 4451 tgl                       733 CBC         119 :     char       *s = buf;
 3884 tgl                       734 GIC         119 :     bool        in_leading_wildcard_meta = false;
                                735             119 :     bool        in_trailing_wildcard_meta = false;
 4382 bruce                     736 CBC         119 :     bool        in_escape = false;
                                737                 :     int         clen;
 4451 tgl                       738 ECB             : 
                                739                 :     /*
 3884                           740                 :      * Find the first word character, remembering whether preceding character
                                741                 :      * was wildcard meta-character.  Note that the in_escape state persists
                                742                 :      * from this loop to the next one, since we may exit at a word character
                                743                 :      * that is in_escape.
                                744                 :      */
 4451 tgl                       745 GIC         241 :     while (beginword - str < lenstr)
                                746                 :     {
                                747             186 :         if (in_escape)
                                748                 :         {
 3652                           749               3 :             if (ISWORDCHR(beginword))
 4451 tgl                       750 CBC           3 :                 break;
 3884 tgl                       751 UIC           0 :             in_escape = false;
 3884 tgl                       752 LBC           0 :             in_leading_wildcard_meta = false;
                                753                 :         }
 4451 tgl                       754 ECB             :         else
                                755                 :         {
 4451 tgl                       756 GBC         183 :             if (ISESCAPECHAR(beginword))
                                757               3 :                 in_escape = true;
 4451 tgl                       758 GIC         180 :             else if (ISWILDCARDCHAR(beginword))
 3884                           759             104 :                 in_leading_wildcard_meta = true;
 3652                           760              76 :             else if (ISWORDCHR(beginword))
 4451 tgl                       761 CBC          61 :                 break;
 4451 tgl                       762 ECB             :             else
 3884 tgl                       763 CBC          15 :                 in_leading_wildcard_meta = false;
 4451 tgl                       764 ECB             :         }
 4451 tgl                       765 CBC         122 :         beginword += pg_mblen(beginword);
 4451 tgl                       766 ECB             :     }
                                767                 : 
                                768                 :     /*
                                769                 :      * Handle string end.
                                770                 :      */
 4451 tgl                       771 GIC         119 :     if (beginword - str >= lenstr)
                                772              55 :         return NULL;
                                773                 : 
                                774                 :     /*
                                775                 :      * Add left padding spaces if preceding character wasn't wildcard
 4451 tgl                       776 ECB             :      * meta-character.
                                777                 :      */
 4451 tgl                       778 GIC          64 :     *charlen = 0;
 3884                           779              64 :     if (!in_leading_wildcard_meta)
                                780                 :     {
                                781                 :         if (LPADDING > 0)
                                782                 :         {
 4451 tgl                       783 CBC          15 :             *s++ = ' ';
                                784              15 :             (*charlen)++;
                                785                 :             if (LPADDING > 1)
                                786                 :             {
 4451 tgl                       787 GIC          15 :                 *s++ = ' ';
 4451 tgl                       788 CBC          15 :                 (*charlen)++;
 4451 tgl                       789 ECB             :             }
                                790                 :         }
                                791                 :     }
                                792                 : 
                                793                 :     /*
                                794                 :      * Copy data into buf until wildcard meta-character, non-word character or
                                795                 :      * string boundary.  Strip escapes during copy.
                                796                 :      */
 4451 tgl                       797 GIC          64 :     endword = beginword;
                                798             244 :     while (endword - str < lenstr)
                                799                 :     {
                                800             244 :         clen = pg_mblen(endword);
                                801             244 :         if (in_escape)
 4451 tgl                       802 ECB             :         {
 3652 tgl                       803 CBC           3 :             if (ISWORDCHR(endword))
                                804                 :             {
 4451                           805               3 :                 memcpy(s, endword, clen);
                                806               3 :                 (*charlen)++;
 4451 tgl                       807 GIC           3 :                 s += clen;
 4451 tgl                       808 ECB             :             }
                                809                 :             else
 3884                           810                 :             {
                                811                 :                 /*
 3602 bruce                     812                 :                  * Back up endword to the escape character when stopping at an
                                813                 :                  * escaped char, so that subsequent get_wildcard_part will
                                814                 :                  * restart from the escape character.  We assume here that
                                815                 :                  * escape chars are single-byte.
                                816                 :                  */
 3884 tgl                       817 UIC           0 :                 endword--;
 4451                           818               0 :                 break;
                                819                 :             }
 3884 tgl                       820 GIC           3 :             in_escape = false;
                                821                 :         }
 4451 tgl                       822 EUB             :         else
                                823                 :         {
 4451 tgl                       824 GIC         241 :             if (ISESCAPECHAR(endword))
 4451 tgl                       825 LBC           0 :                 in_escape = true;
 4451 tgl                       826 GIC         241 :             else if (ISWILDCARDCHAR(endword))
                                827                 :             {
 3884                           828              55 :                 in_trailing_wildcard_meta = true;
 4451 tgl                       829 CBC          55 :                 break;
 4451 tgl                       830 EUB             :             }
 3652 tgl                       831 CBC         186 :             else if (ISWORDCHR(endword))
                                832                 :             {
 4451                           833             177 :                 memcpy(s, endword, clen);
                                834             177 :                 (*charlen)++;
 4451 tgl                       835 GIC         177 :                 s += clen;
 4451 tgl                       836 ECB             :             }
                                837                 :             else
 4451 tgl                       838 CBC           9 :                 break;
 4451 tgl                       839 ECB             :         }
 4451 tgl                       840 CBC         180 :         endword += clen;
                                841                 :     }
                                842                 : 
 4451 tgl                       843 ECB             :     /*
                                844                 :      * Add right padding spaces if next character isn't wildcard
                                845                 :      * meta-character.
                                846                 :      */
 3884 tgl                       847 GIC          64 :     if (!in_trailing_wildcard_meta)
                                848                 :     {
                                849                 :         if (RPADDING > 0)
                                850                 :         {
 4451                           851               9 :             *s++ = ' ';
 4451 tgl                       852 CBC           9 :             (*charlen)++;
                                853                 :             if (RPADDING > 1)
                                854                 :             {
                                855                 :                 *s++ = ' ';
 4451 tgl                       856 ECB             :                 (*charlen)++;
                                857                 :             }
                                858                 :         }
                                859                 :     }
                                860                 : 
 4451 tgl                       861 GIC          64 :     *bytelen = s - buf;
                                862              64 :     return endword;
                                863                 : }
                                864                 : 
                                865                 : /*
 4451 tgl                       866 ECB             :  * Generates trigrams for wildcard search string.
                                867                 :  *
                                868                 :  * Returns array of trigrams that must occur in any string that matches the
                                869                 :  * wildcard string.  For example, given pattern "a%bcd%" the trigrams
                                870                 :  * " a", "bcd" would be extracted.
                                871                 :  */
                                872                 : TRGM *
 4451 tgl                       873 GIC          55 : generate_wildcard_trgm(const char *str, int slen)
                                874                 : {
                                875                 :     TRGM       *trg;
                                876                 :     char       *buf,
                                877                 :                *buf2;
 4451 tgl                       878 ECB             :     trgm       *tptr;
                                879                 :     int         len,
                                880                 :                 charlen,
                                881                 :                 bytelen;
                                882                 :     const char *eword;
                                883                 : 
 2580 teodor                    884 GIC          55 :     protect_out_of_mem(slen);
                                885                 : 
 2118 tgl                       886              55 :     trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
 4451                           887              55 :     trg->flag = ARRKEY;
                                888              55 :     SET_VARSIZE(trg, TRGMHDRSIZE);
 4451 tgl                       889 ECB             : 
 4451 tgl                       890 GIC          55 :     if (slen + LPADDING + RPADDING < 3 || slen == 0)
 4451 tgl                       891 LBC           0 :         return trg;
 4451 tgl                       892 ECB             : 
 4451 tgl                       893 CBC          55 :     tptr = GETARR(trg);
                                894                 : 
 3373 tgl                       895 ECB             :     /* Allocate a buffer for blank-padded, but not yet case-folded, words */
 4451 tgl                       896 GBC          55 :     buf = palloc(sizeof(char) * (slen + 4));
                                897                 : 
 4451 tgl                       898 ECB             :     /*
                                899                 :      * Extract trigrams from each substring extracted by get_wildcard_part.
                                900                 :      */
 4451 tgl                       901 CBC          55 :     eword = str;
 4451 tgl                       902 GIC         119 :     while ((eword = get_wildcard_part(eword, slen - (eword - str),
                                903             119 :                                       buf, &bytelen, &charlen)) != NULL)
                                904                 :     {
                                905                 : #ifdef IGNORECASE
 4451 tgl                       906 CBC          64 :         buf2 = lowerstr_with_len(buf, bytelen);
                                907              64 :         bytelen = strlen(buf2);
 4451 tgl                       908 ECB             : #else
                                909                 :         buf2 = buf;
                                910                 : #endif
                                911                 : 
                                912                 :         /*
                                913                 :          * count trigrams
                                914                 :          */
 4451 tgl                       915 GIC          64 :         tptr = make_trigrams(tptr, buf2, bytelen, charlen);
                                916                 : 
                                917                 : #ifdef IGNORECASE
                                918              64 :         pfree(buf2);
                                919                 : #endif
 4451 tgl                       920 ECB             :     }
                                921                 : 
 4451 tgl                       922 GIC          55 :     pfree(buf);
 4451 tgl                       923 ECB             : 
 4451 tgl                       924 GIC          55 :     if ((len = tptr - GETARR(trg)) == 0)
                                925              24 :         return trg;
                                926                 : 
 4451 tgl                       927 ECB             :     /*
                                928                 :      * Make trigrams unique.
                                929                 :      */
 3373 tgl                       930 CBC          31 :     if (len > 1)
                                931                 :     {
   61 peter                     932 GNC          17 :         qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
 1249 tmunro                    933 GIC          17 :         len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
                                934                 :     }
 4451 tgl                       935 ECB             : 
 4451 tgl                       936 GIC          31 :     SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
 4451 tgl                       937 ECB             : 
 4451 tgl                       938 CBC          31 :     return trg;
                                939                 : }
                                940                 : 
 5261 teodor                    941 ECB             : uint32
 5261 teodor                    942 GIC       34773 : trgm2int(trgm *ptr)
 5261 teodor                    943 ECB             : {
 5050 bruce                     944 GIC       34773 :     uint32      val = 0;
                                945                 : 
                                946           34773 :     val |= *(((unsigned char *) ptr));
 5261 teodor                    947 CBC       34773 :     val <<= 8;
 5050 bruce                     948 GIC       34773 :     val |= *(((unsigned char *) ptr) + 1);
 5261 teodor                    949 CBC       34773 :     val <<= 8;
 5050 bruce                     950 GIC       34773 :     val |= *(((unsigned char *) ptr) + 2);
 5261 teodor                    951 ECB             : 
 5261 teodor                    952 CBC       34773 :     return val;
 5261 teodor                    953 ECB             : }
 6887                           954                 : 
                                955                 : Datum
 6797 bruce                     956 GIC           7 : show_trgm(PG_FUNCTION_ARGS)
 6797 bruce                     957 ECB             : {
 2219 noah                      958 GIC           7 :     text       *in = PG_GETARG_TEXT_PP(0);
                                959                 :     TRGM       *trg;
                                960                 :     Datum      *d;
 6797 bruce                     961 ECB             :     ArrayType  *a;
                                962                 :     trgm       *ptr;
 5750 tgl                       963                 :     int         i;
                                964                 : 
 2219 noah                      965 GIC           7 :     trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
 6797 bruce                     966               7 :     d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
                                967                 : 
 5750 tgl                       968              44 :     for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
                                969                 :     {
 5050 bruce                     970 CBC          37 :         text       *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
 6797 bruce                     971 ECB             : 
 5050 bruce                     972 GIC          37 :         if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
 5261 teodor                    973 ECB             :         {
 5261 teodor                    974 UIC           0 :             snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
 5261 teodor                    975 LBC           0 :             SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
                                976                 :         }
 5261 teodor                    977 ECB             :         else
                                978                 :         {
 5261 teodor                    979 GBC          37 :             SET_VARSIZE(item, VARHDRSZ + 3);
                                980              37 :             CPTRGM(VARDATA(item), ptr);
                                981                 :         }
 5750 tgl                       982 GIC          37 :         d[i] = PointerGetDatum(item);
                                983                 :     }
 6887 teodor                    984 ECB             : 
  282 peter                     985 GNC           7 :     a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
                                986                 : 
 5750 tgl                       987 CBC          44 :     for (i = 0; i < ARRNELEM(trg); i++)
                                988              37 :         pfree(DatumGetPointer(d[i]));
                                989                 : 
 6887 teodor                    990               7 :     pfree(d);
                                991               7 :     pfree(trg);
 6797 bruce                     992               7 :     PG_FREE_IF_COPY(in, 0);
                                993                 : 
 6887 teodor                    994               7 :     PG_RETURN_POINTER(a);
                                995                 : }
                                996                 : 
                                997                 : float4
 2580                           998           69791 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
                                999                 : {
                               1000                 :     trgm       *ptr1,
                               1001                 :                *ptr2;
 6797 bruce                    1002           69791 :     int         count = 0;
                               1003                 :     int         len1,
                               1004                 :                 len2;
                               1005                 : 
 6887 teodor                   1006           69791 :     ptr1 = GETARR(trg1);
                               1007           69791 :     ptr2 = GETARR(trg2);
                               1008                 : 
                               1009           69791 :     len1 = ARRNELEM(trg1);
                               1010           69791 :     len2 = ARRNELEM(trg2);
                               1011                 : 
                               1012                 :     /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
 3707 tgl                      1013           69791 :     if (len1 <= 0 || len2 <= 0)
                               1014               1 :         return (float4) 0.0;
                               1015                 : 
 6797 bruce                    1016          891582 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
                               1017                 :     {
                               1018          821792 :         int         res = CMPTRGM(ptr1, ptr2);
                               1019                 : 
                               1020          821792 :         if (res < 0)
 6887 teodor                   1021          189653 :             ptr1++;
 6797 bruce                    1022          632139 :         else if (res > 0)
 6887 teodor                   1023          220022 :             ptr2++;
                               1024                 :         else
                               1025                 :         {
                               1026          412117 :             ptr1++;
                               1027          412117 :             ptr2++;
                               1028          412117 :             count++;
                               1029                 :         }
                               1030                 :     }
                               1031                 : 
                               1032                 :     /*
                               1033                 :      * If inexact then len2 is equal to count, because we don't know actual
                               1034                 :      * length of second string in inexact search and we can assume that count
                               1035                 :      * is a lower bound of len2.
                               1036                 :      */
 2580                          1037           69790 :     return CALCSML(count, len1, inexact ? count : len2);
                               1038                 : }
                               1039                 : 
                               1040                 : 
                               1041                 : /*
                               1042                 :  * Returns whether trg2 contains all trigrams in trg1.
                               1043                 :  * This relies on the trigram arrays being sorted.
                               1044                 :  */
                               1045                 : bool
 4451 tgl                      1046             190 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
                               1047                 : {
                               1048                 :     trgm       *ptr1,
                               1049                 :                *ptr2;
                               1050                 :     int         len1,
                               1051                 :                 len2;
                               1052                 : 
                               1053             190 :     ptr1 = GETARR(trg1);
                               1054             190 :     ptr2 = GETARR(trg2);
                               1055                 : 
                               1056             190 :     len1 = ARRNELEM(trg1);
                               1057             190 :     len2 = ARRNELEM(trg2);
                               1058                 : 
                               1059             622 :     while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
                               1060                 :     {
                               1061             599 :         int         res = CMPTRGM(ptr1, ptr2);
                               1062                 : 
                               1063             599 :         if (res < 0)
                               1064             167 :             return false;
                               1065             432 :         else if (res > 0)
                               1066             320 :             ptr2++;
                               1067                 :         else
                               1068                 :         {
                               1069             112 :             ptr1++;
                               1070             112 :             ptr2++;
                               1071                 :         }
                               1072                 :     }
                               1073              23 :     if (ptr1 - GETARR(trg1) < len1)
                               1074               4 :         return false;
                               1075                 :     else
                               1076              19 :         return true;
                               1077                 : }
                               1078                 : 
                               1079                 : /*
                               1080                 :  * Return a palloc'd boolean array showing, for each trigram in "query",
                               1081                 :  * whether it is present in the trigram array "key".
                               1082                 :  * This relies on the "key" array being sorted, but "query" need not be.
                               1083                 :  */
                               1084                 : bool *
 3651                          1085            2150 : trgm_presence_map(TRGM *query, TRGM *key)
                               1086                 : {
                               1087                 :     bool       *result;
                               1088            2150 :     trgm       *ptrq = GETARR(query),
                               1089            2150 :                *ptrk = GETARR(key);
                               1090            2150 :     int         lenq = ARRNELEM(query),
                               1091            2150 :                 lenk = ARRNELEM(key),
                               1092                 :                 i;
                               1093                 : 
                               1094            2150 :     result = (bool *) palloc0(lenq * sizeof(bool));
                               1095                 : 
                               1096                 :     /* for each query trigram, do a binary search in the key array */
                               1097          507560 :     for (i = 0; i < lenq; i++)
                               1098                 :     {
                               1099          505410 :         int         lo = 0;
                               1100          505410 :         int         hi = lenk;
                               1101                 : 
                               1102         2373653 :         while (lo < hi)
                               1103                 :         {
                               1104         1876282 :             int         mid = (lo + hi) / 2;
                               1105         1876282 :             int         res = CMPTRGM(ptrq, ptrk + mid);
                               1106                 : 
                               1107         1876282 :             if (res < 0)
                               1108          784082 :                 hi = mid;
                               1109         1092200 :             else if (res > 0)
                               1110         1084161 :                 lo = mid + 1;
                               1111                 :             else
                               1112                 :             {
                               1113            8039 :                 result[i] = true;
                               1114            8039 :                 break;
                               1115                 :             }
                               1116                 :         }
                               1117          505410 :         ptrq++;
                               1118                 :     }
                               1119                 : 
                               1120            2150 :     return result;
                               1121                 : }
                               1122                 : 
                               1123                 : Datum
 6797 bruce                    1124           31452 : similarity(PG_FUNCTION_ARGS)
                               1125                 : {
 2219 noah                     1126           31452 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1127           31452 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1128                 :     TRGM       *trg1,
                               1129                 :                *trg2;
                               1130                 :     float4      res;
                               1131                 : 
                               1132           31452 :     trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
                               1133           31452 :     trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
                               1134                 : 
 2580 teodor                   1135           31452 :     res = cnt_sml(trg1, trg2, false);
                               1136                 : 
 6887                          1137           31452 :     pfree(trg1);
                               1138           31452 :     pfree(trg2);
 6797 bruce                    1139           31452 :     PG_FREE_IF_COPY(in1, 0);
                               1140           31452 :     PG_FREE_IF_COPY(in2, 1);
                               1141                 : 
 6887 teodor                   1142           31452 :     PG_RETURN_FLOAT4(res);
                               1143                 : }
                               1144                 : 
                               1145                 : Datum
 2580                          1146             902 : word_similarity(PG_FUNCTION_ARGS)
                               1147                 : {
                               1148             902 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1149             902 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1150                 :     float4      res;
                               1151                 : 
                               1152            1804 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 2495 rhaas                    1153            1804 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
                               1154                 :                                0);
                               1155                 : 
 1845 teodor                   1156             902 :     PG_FREE_IF_COPY(in1, 0);
                               1157             902 :     PG_FREE_IF_COPY(in2, 1);
                               1158             902 :     PG_RETURN_FLOAT4(res);
                               1159                 : }
                               1160                 : 
                               1161                 : Datum
                               1162             882 : strict_word_similarity(PG_FUNCTION_ARGS)
                               1163                 : {
                               1164             882 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1165             882 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1166                 :     float4      res;
                               1167                 : 
                               1168            1764 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
                               1169            1764 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
                               1170                 :                                WORD_SIMILARITY_STRICT);
                               1171                 : 
 2580                          1172             882 :     PG_FREE_IF_COPY(in1, 0);
                               1173             882 :     PG_FREE_IF_COPY(in2, 1);
                               1174             882 :     PG_RETURN_FLOAT4(res);
                               1175                 : }
                               1176                 : 
                               1177                 : Datum
 4509 tgl                      1178            1004 : similarity_dist(PG_FUNCTION_ARGS)
                               1179                 : {
                               1180            1004 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
                               1181                 :                                                          PG_GETARG_DATUM(0),
                               1182                 :                                                          PG_GETARG_DATUM(1)));
                               1183                 : 
                               1184            1004 :     PG_RETURN_FLOAT4(1.0 - res);
                               1185                 : }
                               1186                 : 
                               1187                 : Datum
 6797 bruce                    1188            6000 : similarity_op(PG_FUNCTION_ARGS)
                               1189                 : {
 4509 tgl                      1190            6000 :     float4      res = DatumGetFloat4(DirectFunctionCall2(similarity,
                               1191                 :                                                          PG_GETARG_DATUM(0),
                               1192                 :                                                          PG_GETARG_DATUM(1)));
                               1193                 : 
 2580 teodor                   1194            6000 :     PG_RETURN_BOOL(res >= similarity_threshold);
                               1195                 : }
                               1196                 : 
                               1197                 : Datum
                               1198            1924 : word_similarity_op(PG_FUNCTION_ARGS)
                               1199                 : {
                               1200            1924 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1201            1924 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1202                 :     float4      res;
                               1203                 : 
                               1204            3848 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 2495 rhaas                    1205            3848 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
                               1206                 :                                WORD_SIMILARITY_CHECK_ONLY);
                               1207                 : 
 2580 teodor                   1208            1924 :     PG_FREE_IF_COPY(in1, 0);
                               1209            1924 :     PG_FREE_IF_COPY(in2, 1);
                               1210            1924 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
                               1211                 : }
                               1212                 : 
                               1213                 : Datum
                               1214            1924 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
                               1215                 : {
                               1216            1924 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1217            1924 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1218                 :     float4      res;
                               1219                 : 
                               1220            3848 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
 2495 rhaas                    1221            3848 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
                               1222                 :                                WORD_SIMILARITY_CHECK_ONLY);
                               1223                 : 
 2580 teodor                   1224            1924 :     PG_FREE_IF_COPY(in1, 0);
                               1225            1924 :     PG_FREE_IF_COPY(in2, 1);
                               1226            1924 :     PG_RETURN_BOOL(res >= word_similarity_threshold);
                               1227                 : }
                               1228                 : 
                               1229                 : Datum
 2580 teodor                   1230 UBC           0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
                               1231                 : {
                               1232               0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1233               0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1234                 :     float4      res;
                               1235                 : 
                               1236               0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
 2495 rhaas                    1237               0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
                               1238                 :                                0);
                               1239                 : 
 2580 teodor                   1240               0 :     PG_FREE_IF_COPY(in1, 0);
                               1241               0 :     PG_FREE_IF_COPY(in2, 1);
                               1242               0 :     PG_RETURN_FLOAT4(1.0 - res);
                               1243                 : }
                               1244                 : 
                               1245                 : Datum
 2580 teodor                   1246 CBC         714 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
                               1247                 : {
                               1248             714 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1249             714 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1250                 :     float4      res;
                               1251                 : 
                               1252            1428 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
 2495 rhaas                    1253            1428 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
                               1254                 :                                0);
                               1255                 : 
 1845 teodor                   1256             714 :     PG_FREE_IF_COPY(in1, 0);
                               1257             714 :     PG_FREE_IF_COPY(in2, 1);
                               1258             714 :     PG_RETURN_FLOAT4(1.0 - res);
                               1259                 : }
                               1260                 : 
                               1261                 : Datum
                               1262            2530 : strict_word_similarity_op(PG_FUNCTION_ARGS)
                               1263                 : {
                               1264            2530 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1265            2530 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1266                 :     float4      res;
                               1267                 : 
                               1268            5060 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
                               1269            5060 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
                               1270                 :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
                               1271                 : 
                               1272            2530 :     PG_FREE_IF_COPY(in1, 0);
                               1273            2530 :     PG_FREE_IF_COPY(in2, 1);
                               1274            2530 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
                               1275                 : }
                               1276                 : 
                               1277                 : Datum
                               1278            2530 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
                               1279                 : {
                               1280            2530 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1281            2530 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1282                 :     float4      res;
                               1283                 : 
                               1284            5060 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
                               1285            5060 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
                               1286                 :                                WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
                               1287                 : 
                               1288            2530 :     PG_FREE_IF_COPY(in1, 0);
                               1289            2530 :     PG_FREE_IF_COPY(in2, 1);
                               1290            2530 :     PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
                               1291                 : }
                               1292                 : 
                               1293                 : Datum
 1845 teodor                   1294 UBC           0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
                               1295                 : {
                               1296               0 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1297               0 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1298                 :     float4      res;
                               1299                 : 
                               1300               0 :     res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
                               1301               0 :                                VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
                               1302                 :                                WORD_SIMILARITY_STRICT);
                               1303                 : 
                               1304               0 :     PG_FREE_IF_COPY(in1, 0);
                               1305               0 :     PG_FREE_IF_COPY(in2, 1);
                               1306               0 :     PG_RETURN_FLOAT4(1.0 - res);
                               1307                 : }
                               1308                 : 
                               1309                 : Datum
 1845 teodor                   1310 CBC         720 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
                               1311                 : {
                               1312             720 :     text       *in1 = PG_GETARG_TEXT_PP(0);
                               1313             720 :     text       *in2 = PG_GETARG_TEXT_PP(1);
                               1314                 :     float4      res;
                               1315                 : 
                               1316            1440 :     res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
                               1317            1440 :                                VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
                               1318                 :                                WORD_SIMILARITY_STRICT);
                               1319                 : 
 2580                          1320             720 :     PG_FREE_IF_COPY(in1, 0);
                               1321             720 :     PG_FREE_IF_COPY(in2, 1);
                               1322             720 :     PG_RETURN_FLOAT4(1.0 - res);
                               1323                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a