Age Owner TLA Line data Source code
1 : /*
2 : * contrib/pg_trgm/trgm_op.c
3 : */
4 : #include "postgres.h"
5 :
6 : #include <ctype.h>
7 :
8 : #include "catalog/pg_type.h"
9 : #include "lib/qunique.h"
10 : #include "miscadmin.h"
11 : #include "trgm.h"
12 : #include "tsearch/ts_locale.h"
13 : #include "utils/guc.h"
14 : #include "utils/lsyscache.h"
15 : #include "utils/memutils.h"
16 : #include "utils/pg_crc.h"
17 :
PG_MODULE_MAGIC;

/*
 * GUC variables.
 *
 * Each threshold below is registered as a custom real-valued setting by
 * _PG_init(); the initial values here match the boot values passed there.
 */

/* registered as pg_trgm.similarity_threshold */
double		similarity_threshold = 0.3f;

/* registered as pg_trgm.word_similarity_threshold */
double		word_similarity_threshold = 0.6f;

/* registered as pg_trgm.strict_word_similarity_threshold */
double		strict_word_similarity_threshold = 0.5f;
24 :
/*
 * Function-info declarations for the SQL-callable entry points.
 */

/* threshold accessors and trigram display */
PG_FUNCTION_INFO_V1(set_limit);
PG_FUNCTION_INFO_V1(show_limit);
PG_FUNCTION_INFO_V1(show_trgm);

/* similarity functions */
PG_FUNCTION_INFO_V1(similarity);
PG_FUNCTION_INFO_V1(word_similarity);
PG_FUNCTION_INFO_V1(strict_word_similarity);

/* whole-string similarity operators */
PG_FUNCTION_INFO_V1(similarity_dist);
PG_FUNCTION_INFO_V1(similarity_op);

/* word similarity operators */
PG_FUNCTION_INFO_V1(word_similarity_op);
PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_op);
PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);

/* strict word similarity operators */
PG_FUNCTION_INFO_V1(strict_word_similarity_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
41 :
42 : /* Trigram with position */
43 : typedef struct
44 : {
45 : trgm trg;
46 : int index;
47 : } pos_trgm;
48 :
49 : /* Trigram bound type */
50 : typedef uint8 TrgmBound;
51 : #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
52 : #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
53 :
/*
 * Word similarity flags (bit mask passed as "flags" to the word similarity
 * routines in this file).
 */

/* only check existence of similar search pattern in text */
#define WORD_SIMILARITY_CHECK_ONLY	0x01
/* force bounds of extent to match word bounds */
#define WORD_SIMILARITY_STRICT		0x02
59 :
60 : /*
61 : * Module load callback
62 : */
63 : void
2580 64 3 : _PG_init(void)
65 : {
66 : /* Define custom GUC variables. */
67 3 : DefineCustomRealVariable("pg_trgm.similarity_threshold",
68 : "Sets the threshold used by the % operator.",
69 : "Valid range is 0.0 .. 1.0.",
70 : &similarity_threshold,
71 : 0.3f,
72 : 0.0,
73 : 1.0,
74 : PGC_USERSET,
75 : 0,
76 : NULL,
77 : NULL,
78 : NULL);
79 3 : DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
80 : "Sets the threshold used by the <% operator.",
81 : "Valid range is 0.0 .. 1.0.",
82 : &word_similarity_threshold,
83 : 0.6f,
84 : 0.0,
85 : 1.0,
86 : PGC_USERSET,
87 : 0,
88 : NULL,
89 : NULL,
90 : NULL);
1845 91 3 : DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
92 : "Sets the threshold used by the <<% operator.",
93 : "Valid range is 0.0 .. 1.0.",
94 : &strict_word_similarity_threshold,
95 : 0.5f,
96 : 0.0,
97 : 1.0,
98 : PGC_USERSET,
99 : 0,
100 : NULL,
101 : NULL,
102 : NULL);
103 :
412 tgl 104 3 : MarkGUCPrefixReserved("pg_trgm");
2580 teodor 105 3 : }
106 :
107 : /*
108 : * Deprecated function.
109 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
110 : */
111 : Datum
6797 bruce 112 2 : set_limit(PG_FUNCTION_ARGS)
113 : {
114 2 : float4 nlimit = PG_GETARG_FLOAT4(0);
115 : char *nlimit_str;
116 : Oid func_out_oid;
117 : bool is_varlena;
118 :
2578 teodor 119 2 : getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
120 :
121 2 : nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
122 :
123 2 : SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
124 : PGC_USERSET, PGC_S_SESSION);
125 :
2580 126 2 : PG_RETURN_FLOAT4(similarity_threshold);
127 : }
128 :
129 :
130 : /*
131 : * Get similarity threshold for given index scan strategy number.
132 : */
133 : double
1845 134 44078 : index_strategy_get_limit(StrategyNumber strategy)
135 : {
136 44078 : switch (strategy)
137 : {
138 33278 : case SimilarityStrategyNumber:
139 33278 : return similarity_threshold;
140 4822 : case WordSimilarityStrategyNumber:
141 4822 : return word_similarity_threshold;
142 5978 : case StrictWordSimilarityStrategyNumber:
143 5978 : return strict_word_similarity_threshold;
1845 teodor 144 UBC 0 : default:
145 0 : elog(ERROR, "unrecognized strategy number: %d", strategy);
146 : break;
147 : }
148 :
149 : return 0.0; /* keep compiler quiet */
150 : }
151 :
152 : /*
153 : * Deprecated function.
154 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
155 : */
156 : Datum
6797 bruce 157 CBC 20000 : show_limit(PG_FUNCTION_ARGS)
158 : {
2580 teodor 159 20000 : PG_RETURN_FLOAT4(similarity_threshold);
160 : }
161 :
/*
 * qsort/qunique comparator for trgm elements; the ordering is whatever
 * CMPTRGM defines.
 */
static int
comp_trgm(const void *a, const void *b)
{
	int			order = CMPTRGM(a, b);

	return order;
}
167 :
168 : /*
169 : * Finds first word in string, returns pointer to the word,
170 : * endword points to the character after word
171 : */
172 : static char *
5050 173 239413 : find_word(char *str, int lenstr, char **endword, int *charlen)
174 : {
175 239413 : char *beginword = str;
176 :
3652 tgl 177 253079 : while (beginword - str < lenstr && !ISWORDCHR(beginword))
5261 teodor 178 13666 : beginword += pg_mblen(beginword);
179 :
180 239413 : if (beginword - str >= lenstr)
181 113080 : return NULL;
182 :
183 126333 : *endword = beginword;
184 126333 : *charlen = 0;
3652 tgl 185 1087687 : while (*endword - str < lenstr && ISWORDCHR(*endword))
186 : {
5261 teodor 187 961354 : *endword += pg_mblen(*endword);
188 961354 : (*charlen)++;
189 : }
190 :
191 126333 : return beginword;
192 : }
193 :
194 : /*
195 : * Reduce a trigram (three possibly multi-byte characters) to a trgm,
196 : * which is always exactly three bytes. If we have three single-byte
197 : * characters, we just use them as-is; otherwise we form a hash value.
198 : */
199 : void
3654 tgl 200 1459 : compact_trigram(trgm *tptr, char *str, int bytelen)
201 : {
5050 bruce 202 1459 : if (bytelen == 3)
203 : {
204 1459 : CPTRGM(tptr, str);
205 : }
206 : else
207 : {
208 : pg_crc32 crc;
209 :
3078 heikki.linnakangas 210 UBC 0 : INIT_LEGACY_CRC32(crc);
211 0 : COMP_LEGACY_CRC32(crc, str, bytelen);
212 0 : FIN_LEGACY_CRC32(crc);
213 :
214 : /*
215 : * use only 3 upper bytes from crc, hope, it's good enough hashing
216 : */
5261 teodor 217 0 : CPTRGM(tptr, &crc);
218 : }
5261 teodor 219 CBC 1459 : }
220 :
221 : /*
222 : * Adds trigrams from words (already padded).
223 : */
224 : static trgm *
5050 bruce 225 126397 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
226 : {
227 126397 : char *ptr = str;
228 :
229 126397 : if (charlen < 3)
5261 teodor 230 27 : return tptr;
231 :
3654 tgl 232 126370 : if (bytelen > charlen)
233 : {
234 : /* Find multibyte character boundaries and apply compact_trigram */
5050 bruce 235 UBC 0 : int lenfirst = pg_mblen(str),
236 0 : lenmiddle = pg_mblen(str + lenfirst),
237 0 : lenlast = pg_mblen(str + lenfirst + lenmiddle);
238 :
239 0 : while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
240 : {
3654 tgl 241 0 : compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
242 :
5261 teodor 243 0 : ptr += lenfirst;
244 0 : tptr++;
245 :
5050 bruce 246 0 : lenfirst = lenmiddle;
247 0 : lenmiddle = lenlast;
248 0 : lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
249 : }
250 : }
251 : else
252 : {
253 : /* Fast path when there are no multibyte characters */
5050 bruce 254 CBC 126370 : Assert(bytelen == charlen);
255 :
5261 teodor 256 1214148 : while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
257 : {
258 1087778 : CPTRGM(tptr, ptr);
259 1087778 : ptr++;
260 1087778 : tptr++;
261 : }
262 : }
263 :
264 126370 : return tptr;
265 : }
266 :
267 : /*
268 : * Make array of trigrams without sorting and removing duplicate items.
269 : *
270 : * trg: where to return the array of trigrams.
271 : * str: source string, of length slen bytes.
272 : * bounds: where to return bounds of trigrams (if needed).
273 : *
274 : * Returns length of the generated array.
275 : */
276 : static int
1845 277 113081 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
278 : {
279 : trgm *tptr;
280 : char *buf;
281 : int charlen,
282 : bytelen;
283 : char *bword,
284 : *eword;
285 :
6797 bruce 286 113081 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
2580 teodor 287 1 : return 0;
288 :
289 113080 : tptr = trg;
290 :
291 : /* Allocate a buffer for case-folded, blank-padded words */
3373 tgl 292 113080 : buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
293 :
294 : if (LPADDING > 0)
295 : {
6887 teodor 296 113080 : *buf = ' ';
297 : if (LPADDING > 1)
6797 bruce 298 113080 : *(buf + 1) = ' ';
299 : }
300 :
5261 teodor 301 113080 : eword = str;
5050 bruce 302 239413 : while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
303 : {
304 : #ifdef IGNORECASE
5261 teodor 305 126333 : bword = lowerstr_with_len(bword, eword - bword);
306 126333 : bytelen = strlen(bword);
307 : #else
308 : bytelen = eword - bword;
309 : #endif
310 :
311 126333 : memcpy(buf + LPADDING, bword, bytelen);
312 :
313 : #ifdef IGNORECASE
314 126333 : pfree(bword);
315 : #endif
316 :
5050 bruce 317 126333 : buf[LPADDING + bytelen] = ' ';
318 126333 : buf[LPADDING + bytelen + 1] = ' ';
319 :
320 : /* Calculate trigrams marking their bounds if needed */
1845 teodor 321 126333 : if (bounds)
322 12400 : bounds[tptr - trg] |= TRGM_BOUND_LEFT;
5050 bruce 323 126333 : tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
324 : charlen + LPADDING + RPADDING);
1845 teodor 325 126333 : if (bounds)
326 12400 : bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
327 : }
328 :
6887 329 113080 : pfree(buf);
330 :
2580 331 113080 : return tptr - trg;
332 : }
333 :
334 : /*
335 : * Guard against possible overflow in the palloc requests below. (We
336 : * don't worry about the additive constants, since palloc can detect
337 : * requests that are a little above MaxAllocSize --- we just need to
338 : * prevent integer overflow in the multiplications.)
339 : */
340 : static void
341 101010 : protect_out_of_mem(int slen)
342 : {
343 101010 : if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
344 101010 : (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
2580 teodor 345 UBC 0 : ereport(ERROR,
346 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
347 : errmsg("out of memory")));
2580 teodor 348 CBC 101010 : }
349 :
350 : /*
351 : * Make array of trigrams with sorting and removing duplicate items.
352 : *
353 : * str: source string, of length slen bytes.
354 : *
355 : * Returns the sorted array of unique trigrams.
356 : */
357 : TRGM *
358 88829 : generate_trgm(char *str, int slen)
359 : {
360 : TRGM *trg;
361 : int len;
362 :
363 88829 : protect_out_of_mem(slen);
364 :
2118 tgl 365 88829 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
2580 teodor 366 88829 : trg->flag = ARRKEY;
367 :
1845 368 88829 : len = generate_trgm_only(GETARR(trg), str, slen, NULL);
2580 369 88829 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
370 :
371 88829 : if (len == 0)
6887 372 4 : return trg;
373 :
374 : /*
375 : * Make trigrams unique.
376 : */
3373 tgl 377 88825 : if (len > 1)
378 : {
61 peter 379 GNC 88825 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
1249 tmunro 380 CBC 88825 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
381 : }
382 :
5884 tgl 383 88825 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
384 :
6887 teodor 385 88825 : return trg;
386 : }
387 :
388 : /*
389 : * Make array of positional trigrams from two trigram arrays trg1 and trg2.
390 : *
391 : * trg1: trigram array of search pattern, of length len1. trg1 is required
392 : * word which positions don't matter and replaced with -1.
393 : * trg2: trigram array of text, of length len2. trg2 is haystack where we
394 : * search and have to store its positions.
395 : *
396 : * Returns concatenated trigram array.
397 : */
398 : static pos_trgm *
2580 399 12126 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
400 : {
401 : pos_trgm *result;
402 : int i,
2495 rhaas 403 12126 : len = len1 + len2;
404 :
2580 teodor 405 12126 : result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
406 :
407 120864 : for (i = 0; i < len1; i++)
408 : {
409 108738 : memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
410 108738 : result[i].index = -1;
411 : }
412 :
413 192225 : for (i = 0; i < len2; i++)
414 : {
415 180099 : memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
416 180099 : result[i + len1].index = i;
417 : }
418 :
419 12126 : return result;
420 : }
421 :
422 : /*
423 : * Compare position trigrams: compare trigrams first and position second.
424 : */
425 : static int
426 1307800 : comp_ptrgm(const void *v1, const void *v2)
427 : {
2495 rhaas 428 1307800 : const pos_trgm *p1 = (const pos_trgm *) v1;
429 1307800 : const pos_trgm *p2 = (const pos_trgm *) v2;
430 : int cmp;
431 :
2580 teodor 432 1307800 : cmp = CMPTRGM(p1->trg, p2->trg);
433 1307800 : if (cmp != 0)
434 1268095 : return cmp;
435 :
436 39705 : if (p1->index < p2->index)
437 21365 : return -1;
438 18340 : else if (p1->index == p2->index)
2580 teodor 439 UBC 0 : return 0;
440 : else
2580 teodor 441 CBC 18340 : return 1;
442 : }
443 :
444 : /*
445 : * Iterative search function which calculates maximum similarity with word in
446 : * the string. Maximum similarity is only calculated only if the flag
447 : * WORD_SIMILARITY_CHECK_ONLY isn't set.
448 : *
449 : * trg2indexes: array which stores indexes of the array "found".
450 : * found: array which stores true of false values.
451 : * ulen1: count of unique trigrams of array "trg1".
452 : * len2: length of array "trg2" and array "trg2indexes".
453 : * len: length of the array "found".
454 : * flags: set of boolean flags parameterizing similarity calculation.
455 : * bounds: whether each trigram is left/right bound of word.
456 : *
457 : * Returns word similarity.
458 : */
459 : static float4
2580 teodor 460 GIC 12126 : iterate_word_similarity(int *trg2indexes,
2580 teodor 461 ECB : bool *found,
462 : int ulen1,
463 : int len2,
464 : int len,
465 : uint8 flags,
466 : TrgmBound *bounds)
467 : {
468 : int *lastpos,
469 : i,
2580 teodor 470 GIC 12126 : ulen2 = 0,
2580 teodor 471 CBC 12126 : count = 0,
472 12126 : upper = -1,
1845 teodor 473 ECB : lower;
474 : float4 smlr_cur,
2580 teodor 475 GIC 12126 : smlr_max = 0.0f;
1845 teodor 476 ECB : double threshold;
477 :
1845 teodor 478 GIC 12126 : Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
1845 teodor 479 ECB :
480 : /* Select appropriate threshold */
1845 teodor 481 GIC 24252 : threshold = (flags & WORD_SIMILARITY_STRICT) ?
1809 tgl 482 CBC 12126 : strict_word_similarity_threshold :
1809 tgl 483 ECB : word_similarity_threshold;
484 :
485 : /*
486 : * Consider first trigram as initial lower bound for strict word
487 : * similarity, or initialize it later with first trigram present for plain
488 : * word similarity.
489 : */
1845 teodor 490 GIC 12126 : lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
2580 teodor 491 ECB :
492 : /* Memorise last position of each trigram */
2580 teodor 493 GIC 12126 : lastpos = (int *) palloc(sizeof(int) * len);
2580 teodor 494 CBC 12126 : memset(lastpos, -1, sizeof(int) * len);
2580 teodor 495 ECB :
2580 teodor 496 GIC 183655 : for (i = 0; i < len2; i++)
2580 teodor 497 ECB : {
498 : int trgindex;
499 :
216 dgustafsson 500 GNC 173313 : CHECK_FOR_INTERRUPTS();
501 :
502 : /* Get index of next trigram */
503 173313 : trgindex = trg2indexes[i];
504 :
2580 teodor 505 ECB : /* Update last position of this trigram */
2580 teodor 506 GIC 173313 : if (lower >= 0 || found[trgindex])
507 : {
2580 teodor 508 CBC 135805 : if (lastpos[trgindex] < 0)
509 : {
2580 teodor 510 GIC 133952 : ulen2++;
2580 teodor 511 CBC 133952 : if (found[trgindex])
2580 teodor 512 GIC 30756 : count++;
2580 teodor 513 ECB : }
2580 teodor 514 GIC 135805 : lastpos[trgindex] = i;
2580 teodor 515 ECB : }
516 :
1845 517 : /*
518 : * Adjust upper bound if trigram is upper bound of word for strict
519 : * word similarity, or if trigram is present in required substring for
520 : * plain word similarity
521 : */
1845 teodor 522 GIC 250355 : if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
1809 tgl 523 77042 : : found[trgindex])
524 : {
525 : int prev_lower,
526 : tmp_ulen2,
2495 rhaas 527 ECB : tmp_lower,
528 : tmp_count;
529 :
2580 teodor 530 GIC 25638 : upper = i;
531 25638 : if (lower == -1)
532 : {
533 4695 : lower = i;
534 4695 : ulen2 = 1;
2580 teodor 535 ECB : }
536 :
2580 teodor 537 GIC 25638 : smlr_cur = CALCSML(count, ulen1, ulen2);
2580 teodor 538 ECB :
1845 539 : /* Also try to adjust lower bound for greater similarity */
2580 teodor 540 GIC 25638 : tmp_count = count;
541 25638 : tmp_ulen2 = ulen2;
2580 teodor 542 CBC 25638 : prev_lower = lower;
2580 teodor 543 GIC 208652 : for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
544 : {
1845 teodor 545 ECB : float smlr_tmp;
2495 rhaas 546 : int tmp_trgindex;
2580 teodor 547 :
548 : /*
549 : * Adjust lower bound only if trigram is lower bound of word
550 : * for strict word similarity, or consider every trigram as
551 : * lower bound for plain word similarity.
552 : */
1845 teodor 553 GIC 184798 : if (!(flags & WORD_SIMILARITY_STRICT)
554 145233 : || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
555 : {
556 59704 : smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
557 59704 : if (smlr_tmp > smlr_cur)
1845 teodor 558 ECB : {
1845 teodor 559 CBC 3511 : smlr_cur = smlr_tmp;
1845 teodor 560 GIC 3511 : ulen2 = tmp_ulen2;
1845 teodor 561 CBC 3511 : lower = tmp_lower;
562 3511 : count = tmp_count;
563 : }
1845 teodor 564 ECB :
565 : /*
566 : * If we only check that word similarity is greater than
567 : * threshold we do not need to calculate a maximum
568 : * similarity.
569 : */
1845 teodor 570 GIC 59704 : if ((flags & WORD_SIMILARITY_CHECK_ONLY)
571 37114 : && smlr_cur >= threshold)
572 1784 : break;
573 : }
574 :
2580 teodor 575 CBC 183014 : tmp_trgindex = trg2indexes[tmp_lower];
576 183014 : if (lastpos[tmp_trgindex] == tmp_lower)
2580 teodor 577 ECB : {
2580 teodor 578 GIC 180753 : tmp_ulen2--;
579 180753 : if (found[tmp_trgindex])
2580 teodor 580 CBC 46591 : tmp_count--;
2580 teodor 581 ECB : }
582 : }
583 :
2580 teodor 584 CBC 25638 : smlr_max = Max(smlr_max, smlr_cur);
2495 rhaas 585 ECB :
586 : /*
587 : * if we only check that word similarity is greater than threshold
588 : * we do not need to calculate a maximum similarity.
2580 teodor 589 : */
1845 teodor 590 GIC 25638 : if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
2580 591 1784 : break;
592 :
593 40602 : for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
594 : {
2495 rhaas 595 ECB : int tmp_trgindex;
596 :
2580 teodor 597 GIC 16748 : tmp_trgindex = trg2indexes[tmp_lower];
2580 teodor 598 CBC 16748 : if (lastpos[tmp_trgindex] == tmp_lower)
2580 teodor 599 GIC 16000 : lastpos[tmp_trgindex] = -1;
600 : }
601 : }
2580 teodor 602 ECB : }
603 :
2580 teodor 604 CBC 12126 : pfree(lastpos);
605 :
2580 teodor 606 GIC 12126 : return smlr_max;
607 : }
608 :
2580 teodor 609 ECB : /*
610 : * Calculate word similarity.
611 : * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
612 : * are used to calculate word similarity using iterate_word_similarity().
613 : *
614 : * "trg2indexes" is array which stores indexes of the array "found".
615 : * In other words:
616 : * trg2indexes[j] = i;
617 : * found[i] = true (or false);
618 : * If found[i] == true then there is trigram trg2[j] in array "trg1".
619 : * If found[i] == false then there is not trigram trg2[j] in array "trg1".
620 : *
621 : * str1: search pattern string, of length slen1 bytes.
622 : * str2: text in which we are looking for a word, of length slen2 bytes.
623 : * flags: set of boolean flags parameterizing similarity calculation.
624 : *
625 : * Returns word similarity.
626 : */
627 : static float4
2580 teodor 628 GIC 12126 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
629 : uint8 flags)
630 : {
631 : bool *found;
632 : pos_trgm *ptrg;
2580 teodor 633 ECB : trgm *trg1;
634 : trgm *trg2;
635 : int len1,
636 : len2,
637 : len,
638 : i,
639 : j,
640 : ulen1;
641 : int *trg2indexes;
642 : float4 result;
643 : TrgmBound *bounds;
644 :
2580 teodor 645 GIC 12126 : protect_out_of_mem(slen1 + slen2);
646 :
647 : /* Make positional trigrams */
2118 tgl 648 12126 : trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
649 12126 : trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
1845 teodor 650 CBC 12126 : if (flags & WORD_SIMILARITY_STRICT)
1845 teodor 651 GIC 6662 : bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
652 : else
1845 teodor 653 CBC 5464 : bounds = NULL;
2580 teodor 654 ECB :
1845 teodor 655 CBC 12126 : len1 = generate_trgm_only(trg1, str1, slen1, NULL);
656 12126 : len2 = generate_trgm_only(trg2, str2, slen2, bounds);
657 :
2580 658 12126 : ptrg = make_positional_trgm(trg1, len1, trg2, len2);
2580 teodor 659 GIC 12126 : len = len1 + len2;
2580 teodor 660 CBC 12126 : qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
2580 teodor 661 ECB :
2580 teodor 662 GIC 12126 : pfree(trg1);
2580 teodor 663 CBC 12126 : pfree(trg2);
2580 teodor 664 ECB :
665 : /*
666 : * Merge positional trigrams array: enumerate each trigram and find its
667 : * presence in required word.
668 : */
2580 teodor 669 GIC 12126 : trg2indexes = (int *) palloc(sizeof(int) * len2);
670 12126 : found = (bool *) palloc0(sizeof(bool) * len);
671 :
672 12126 : ulen1 = 0;
673 12126 : j = 0;
2580 teodor 674 CBC 300963 : for (i = 0; i < len; i++)
2580 teodor 675 ECB : {
2580 teodor 676 GIC 288837 : if (i > 0)
2580 teodor 677 ECB : {
2495 rhaas 678 CBC 276711 : int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
2495 rhaas 679 ECB :
2580 teodor 680 GIC 276711 : if (cmp != 0)
2580 teodor 681 ECB : {
2580 teodor 682 GIC 242510 : if (found[j])
2580 teodor 683 CBC 101138 : ulen1++;
2580 teodor 684 GIC 242510 : j++;
2580 teodor 685 ECB : }
686 : }
687 :
2580 teodor 688 CBC 288837 : if (ptrg[i].index >= 0)
2580 teodor 689 ECB : {
2580 teodor 690 GIC 180099 : trg2indexes[ptrg[i].index] = j;
691 : }
692 : else
2580 teodor 693 ECB : {
2580 teodor 694 GIC 108738 : found[j] = true;
2580 teodor 695 ECB : }
696 : }
2580 teodor 697 GIC 12126 : if (found[j])
698 7600 : ulen1++;
2580 teodor 699 ECB :
700 : /* Run iterative procedure to find maximum similarity with word */
2580 teodor 701 GIC 12126 : result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
1845 teodor 702 ECB : flags, bounds);
2580 703 :
2580 teodor 704 GIC 12126 : pfree(trg2indexes);
705 12126 : pfree(found);
2580 teodor 706 CBC 12126 : pfree(ptrg);
707 :
2580 teodor 708 GIC 12126 : return result;
2580 teodor 709 ECB : }
710 :
711 :
712 : /*
2578 rhaas 713 : * Extract the next non-wildcard part of a search string, i.e. a word bounded
714 : * by '_' or '%' meta-characters, non-word characters or string end.
715 : *
716 : * str: source string, of length lenstr bytes (need not be null-terminated)
717 : * buf: where to return the substring (must be long enough)
718 : * *bytelen: receives byte length of the found substring
719 : * *charlen: receives character length of the found substring
720 : *
721 : * Returns pointer to end+1 of the found substring in the source string.
722 : * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
723 : *
724 : * If the found word is bounded by non-word characters or string boundaries
725 : * then this function will include corresponding padding spaces into buf.
726 : */
727 : static const char *
4451 tgl 728 GIC 119 : get_wildcard_part(const char *str, int lenstr,
729 : char *buf, int *bytelen, int *charlen)
730 : {
731 119 : const char *beginword = str;
732 : const char *endword;
4451 tgl 733 CBC 119 : char *s = buf;
3884 tgl 734 GIC 119 : bool in_leading_wildcard_meta = false;
735 119 : bool in_trailing_wildcard_meta = false;
4382 bruce 736 CBC 119 : bool in_escape = false;
737 : int clen;
4451 tgl 738 ECB :
739 : /*
3884 740 : * Find the first word character, remembering whether preceding character
741 : * was wildcard meta-character. Note that the in_escape state persists
742 : * from this loop to the next one, since we may exit at a word character
743 : * that is in_escape.
744 : */
4451 tgl 745 GIC 241 : while (beginword - str < lenstr)
746 : {
747 186 : if (in_escape)
748 : {
3652 749 3 : if (ISWORDCHR(beginword))
4451 tgl 750 CBC 3 : break;
3884 tgl 751 UIC 0 : in_escape = false;
3884 tgl 752 LBC 0 : in_leading_wildcard_meta = false;
753 : }
4451 tgl 754 ECB : else
755 : {
4451 tgl 756 GBC 183 : if (ISESCAPECHAR(beginword))
757 3 : in_escape = true;
4451 tgl 758 GIC 180 : else if (ISWILDCARDCHAR(beginword))
3884 759 104 : in_leading_wildcard_meta = true;
3652 760 76 : else if (ISWORDCHR(beginword))
4451 tgl 761 CBC 61 : break;
4451 tgl 762 ECB : else
3884 tgl 763 CBC 15 : in_leading_wildcard_meta = false;
4451 tgl 764 ECB : }
4451 tgl 765 CBC 122 : beginword += pg_mblen(beginword);
4451 tgl 766 ECB : }
767 :
768 : /*
769 : * Handle string end.
770 : */
4451 tgl 771 GIC 119 : if (beginword - str >= lenstr)
772 55 : return NULL;
773 :
774 : /*
775 : * Add left padding spaces if preceding character wasn't wildcard
4451 tgl 776 ECB : * meta-character.
777 : */
4451 tgl 778 GIC 64 : *charlen = 0;
3884 779 64 : if (!in_leading_wildcard_meta)
780 : {
781 : if (LPADDING > 0)
782 : {
4451 tgl 783 CBC 15 : *s++ = ' ';
784 15 : (*charlen)++;
785 : if (LPADDING > 1)
786 : {
4451 tgl 787 GIC 15 : *s++ = ' ';
4451 tgl 788 CBC 15 : (*charlen)++;
4451 tgl 789 ECB : }
790 : }
791 : }
792 :
793 : /*
794 : * Copy data into buf until wildcard meta-character, non-word character or
795 : * string boundary. Strip escapes during copy.
796 : */
4451 tgl 797 GIC 64 : endword = beginword;
798 244 : while (endword - str < lenstr)
799 : {
800 244 : clen = pg_mblen(endword);
801 244 : if (in_escape)
4451 tgl 802 ECB : {
3652 tgl 803 CBC 3 : if (ISWORDCHR(endword))
804 : {
4451 805 3 : memcpy(s, endword, clen);
806 3 : (*charlen)++;
4451 tgl 807 GIC 3 : s += clen;
4451 tgl 808 ECB : }
809 : else
3884 810 : {
811 : /*
3602 bruce 812 : * Back up endword to the escape character when stopping at an
813 : * escaped char, so that subsequent get_wildcard_part will
814 : * restart from the escape character. We assume here that
815 : * escape chars are single-byte.
816 : */
3884 tgl 817 UIC 0 : endword--;
4451 818 0 : break;
819 : }
3884 tgl 820 GIC 3 : in_escape = false;
821 : }
4451 tgl 822 EUB : else
823 : {
4451 tgl 824 GIC 241 : if (ISESCAPECHAR(endword))
4451 tgl 825 LBC 0 : in_escape = true;
4451 tgl 826 GIC 241 : else if (ISWILDCARDCHAR(endword))
827 : {
3884 828 55 : in_trailing_wildcard_meta = true;
4451 tgl 829 CBC 55 : break;
4451 tgl 830 EUB : }
3652 tgl 831 CBC 186 : else if (ISWORDCHR(endword))
832 : {
4451 833 177 : memcpy(s, endword, clen);
834 177 : (*charlen)++;
4451 tgl 835 GIC 177 : s += clen;
4451 tgl 836 ECB : }
837 : else
4451 tgl 838 CBC 9 : break;
4451 tgl 839 ECB : }
4451 tgl 840 CBC 180 : endword += clen;
841 : }
842 :
4451 tgl 843 ECB : /*
844 : * Add right padding spaces if next character isn't wildcard
845 : * meta-character.
846 : */
3884 tgl 847 GIC 64 : if (!in_trailing_wildcard_meta)
848 : {
849 : if (RPADDING > 0)
850 : {
4451 851 9 : *s++ = ' ';
4451 tgl 852 CBC 9 : (*charlen)++;
853 : if (RPADDING > 1)
854 : {
855 : *s++ = ' ';
4451 tgl 856 ECB : (*charlen)++;
857 : }
858 : }
859 : }
860 :
4451 tgl 861 GIC 64 : *bytelen = s - buf;
862 64 : return endword;
863 : }
864 :
865 : /*
4451 tgl 866 ECB : * Generates trigrams for wildcard search string.
867 : *
868 : * Returns array of trigrams that must occur in any string that matches the
869 : * wildcard string. For example, given pattern "a%bcd%" the trigrams
870 : * " a", "bcd" would be extracted.
871 : */
872 : TRGM *
4451 tgl 873 GIC 55 : generate_wildcard_trgm(const char *str, int slen)
874 : {
875 : TRGM *trg;
876 : char *buf,
877 : *buf2;
4451 tgl 878 ECB : trgm *tptr;
879 : int len,
880 : charlen,
881 : bytelen;
882 : const char *eword;
883 :
2580 teodor 884 GIC 55 : protect_out_of_mem(slen);
885 :
2118 tgl 886 55 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
4451 887 55 : trg->flag = ARRKEY;
888 55 : SET_VARSIZE(trg, TRGMHDRSIZE);
4451 tgl 889 ECB :
4451 tgl 890 GIC 55 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
4451 tgl 891 LBC 0 : return trg;
4451 tgl 892 ECB :
4451 tgl 893 CBC 55 : tptr = GETARR(trg);
894 :
3373 tgl 895 ECB : /* Allocate a buffer for blank-padded, but not yet case-folded, words */
4451 tgl 896 GBC 55 : buf = palloc(sizeof(char) * (slen + 4));
897 :
4451 tgl 898 ECB : /*
899 : * Extract trigrams from each substring extracted by get_wildcard_part.
900 : */
4451 tgl 901 CBC 55 : eword = str;
4451 tgl 902 GIC 119 : while ((eword = get_wildcard_part(eword, slen - (eword - str),
903 119 : buf, &bytelen, &charlen)) != NULL)
904 : {
905 : #ifdef IGNORECASE
4451 tgl 906 CBC 64 : buf2 = lowerstr_with_len(buf, bytelen);
907 64 : bytelen = strlen(buf2);
4451 tgl 908 ECB : #else
909 : buf2 = buf;
910 : #endif
911 :
912 : /*
913 : * count trigrams
914 : */
4451 tgl 915 GIC 64 : tptr = make_trigrams(tptr, buf2, bytelen, charlen);
916 :
917 : #ifdef IGNORECASE
918 64 : pfree(buf2);
919 : #endif
4451 tgl 920 ECB : }
921 :
4451 tgl 922 GIC 55 : pfree(buf);
4451 tgl 923 ECB :
4451 tgl 924 GIC 55 : if ((len = tptr - GETARR(trg)) == 0)
925 24 : return trg;
926 :
4451 tgl 927 ECB : /*
928 : * Make trigrams unique.
929 : */
3373 tgl 930 CBC 31 : if (len > 1)
931 : {
61 peter 932 GNC 17 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
1249 tmunro 933 GIC 17 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
934 : }
4451 tgl 935 ECB :
4451 tgl 936 GIC 31 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
4451 tgl 937 ECB :
4451 tgl 938 CBC 31 : return trg;
939 : }
940 :
5261 teodor 941 ECB : uint32
5261 teodor 942 GIC 34773 : trgm2int(trgm *ptr)
5261 teodor 943 ECB : {
5050 bruce 944 GIC 34773 : uint32 val = 0;
945 :
946 34773 : val |= *(((unsigned char *) ptr));
5261 teodor 947 CBC 34773 : val <<= 8;
5050 bruce 948 GIC 34773 : val |= *(((unsigned char *) ptr) + 1);
5261 teodor 949 CBC 34773 : val <<= 8;
5050 bruce 950 GIC 34773 : val |= *(((unsigned char *) ptr) + 2);
5261 teodor 951 ECB :
5261 teodor 952 CBC 34773 : return val;
5261 teodor 953 ECB : }
6887 954 :
955 : Datum
6797 bruce 956 GIC 7 : show_trgm(PG_FUNCTION_ARGS)
6797 bruce 957 ECB : {
2219 noah 958 GIC 7 : text *in = PG_GETARG_TEXT_PP(0);
959 : TRGM *trg;
960 : Datum *d;
6797 bruce 961 ECB : ArrayType *a;
962 : trgm *ptr;
5750 tgl 963 : int i;
964 :
2219 noah 965 GIC 7 : trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
6797 bruce 966 7 : d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
967 :
5750 tgl 968 44 : for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
969 : {
5050 bruce 970 CBC 37 : text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
6797 bruce 971 ECB :
5050 bruce 972 GIC 37 : if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
5261 teodor 973 ECB : {
5261 teodor 974 UIC 0 : snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
5261 teodor 975 LBC 0 : SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
976 : }
5261 teodor 977 ECB : else
978 : {
5261 teodor 979 GBC 37 : SET_VARSIZE(item, VARHDRSZ + 3);
980 37 : CPTRGM(VARDATA(item), ptr);
981 : }
5750 tgl 982 GIC 37 : d[i] = PointerGetDatum(item);
983 : }
6887 teodor 984 ECB :
282 peter 985 GNC 7 : a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
986 :
5750 tgl 987 CBC 44 : for (i = 0; i < ARRNELEM(trg); i++)
988 37 : pfree(DatumGetPointer(d[i]));
989 :
6887 teodor 990 7 : pfree(d);
991 7 : pfree(trg);
6797 bruce 992 7 : PG_FREE_IF_COPY(in, 0);
993 :
6887 teodor 994 7 : PG_RETURN_POINTER(a);
995 : }
996 :
997 : float4
2580 998 69791 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
999 : {
1000 : trgm *ptr1,
1001 : *ptr2;
6797 bruce 1002 69791 : int count = 0;
1003 : int len1,
1004 : len2;
1005 :
6887 teodor 1006 69791 : ptr1 = GETARR(trg1);
1007 69791 : ptr2 = GETARR(trg2);
1008 :
1009 69791 : len1 = ARRNELEM(trg1);
1010 69791 : len2 = ARRNELEM(trg2);
1011 :
1012 : /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
3707 tgl 1013 69791 : if (len1 <= 0 || len2 <= 0)
1014 1 : return (float4) 0.0;
1015 :
6797 bruce 1016 891582 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1017 : {
1018 821792 : int res = CMPTRGM(ptr1, ptr2);
1019 :
1020 821792 : if (res < 0)
6887 teodor 1021 189653 : ptr1++;
6797 bruce 1022 632139 : else if (res > 0)
6887 teodor 1023 220022 : ptr2++;
1024 : else
1025 : {
1026 412117 : ptr1++;
1027 412117 : ptr2++;
1028 412117 : count++;
1029 : }
1030 : }
1031 :
1032 : /*
1033 : * If inexact then len2 is equal to count, because we don't know actual
1034 : * length of second string in inexact search and we can assume that count
1035 : * is a lower bound of len2.
1036 : */
2580 1037 69790 : return CALCSML(count, len1, inexact ? count : len2);
1038 : }
1039 :
1040 :
1041 : /*
1042 : * Returns whether trg2 contains all trigrams in trg1.
1043 : * This relies on the trigram arrays being sorted.
1044 : */
1045 : bool
4451 tgl 1046 190 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
1047 : {
1048 : trgm *ptr1,
1049 : *ptr2;
1050 : int len1,
1051 : len2;
1052 :
1053 190 : ptr1 = GETARR(trg1);
1054 190 : ptr2 = GETARR(trg2);
1055 :
1056 190 : len1 = ARRNELEM(trg1);
1057 190 : len2 = ARRNELEM(trg2);
1058 :
1059 622 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1060 : {
1061 599 : int res = CMPTRGM(ptr1, ptr2);
1062 :
1063 599 : if (res < 0)
1064 167 : return false;
1065 432 : else if (res > 0)
1066 320 : ptr2++;
1067 : else
1068 : {
1069 112 : ptr1++;
1070 112 : ptr2++;
1071 : }
1072 : }
1073 23 : if (ptr1 - GETARR(trg1) < len1)
1074 4 : return false;
1075 : else
1076 19 : return true;
1077 : }
1078 :
1079 : /*
1080 : * Return a palloc'd boolean array showing, for each trigram in "query",
1081 : * whether it is present in the trigram array "key".
1082 : * This relies on the "key" array being sorted, but "query" need not be.
1083 : */
1084 : bool *
3651 1085 2150 : trgm_presence_map(TRGM *query, TRGM *key)
1086 : {
1087 : bool *result;
1088 2150 : trgm *ptrq = GETARR(query),
1089 2150 : *ptrk = GETARR(key);
1090 2150 : int lenq = ARRNELEM(query),
1091 2150 : lenk = ARRNELEM(key),
1092 : i;
1093 :
1094 2150 : result = (bool *) palloc0(lenq * sizeof(bool));
1095 :
1096 : /* for each query trigram, do a binary search in the key array */
1097 507560 : for (i = 0; i < lenq; i++)
1098 : {
1099 505410 : int lo = 0;
1100 505410 : int hi = lenk;
1101 :
1102 2373653 : while (lo < hi)
1103 : {
1104 1876282 : int mid = (lo + hi) / 2;
1105 1876282 : int res = CMPTRGM(ptrq, ptrk + mid);
1106 :
1107 1876282 : if (res < 0)
1108 784082 : hi = mid;
1109 1092200 : else if (res > 0)
1110 1084161 : lo = mid + 1;
1111 : else
1112 : {
1113 8039 : result[i] = true;
1114 8039 : break;
1115 : }
1116 : }
1117 505410 : ptrq++;
1118 : }
1119 :
1120 2150 : return result;
1121 : }
1122 :
1123 : Datum
6797 bruce 1124 31452 : similarity(PG_FUNCTION_ARGS)
1125 : {
2219 noah 1126 31452 : text *in1 = PG_GETARG_TEXT_PP(0);
1127 31452 : text *in2 = PG_GETARG_TEXT_PP(1);
1128 : TRGM *trg1,
1129 : *trg2;
1130 : float4 res;
1131 :
1132 31452 : trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1133 31452 : trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1134 :
2580 teodor 1135 31452 : res = cnt_sml(trg1, trg2, false);
1136 :
6887 1137 31452 : pfree(trg1);
1138 31452 : pfree(trg2);
6797 bruce 1139 31452 : PG_FREE_IF_COPY(in1, 0);
1140 31452 : PG_FREE_IF_COPY(in2, 1);
1141 :
6887 teodor 1142 31452 : PG_RETURN_FLOAT4(res);
1143 : }
1144 :
1145 : Datum
2580 1146 902 : word_similarity(PG_FUNCTION_ARGS)
1147 : {
1148 902 : text *in1 = PG_GETARG_TEXT_PP(0);
1149 902 : text *in2 = PG_GETARG_TEXT_PP(1);
1150 : float4 res;
1151 :
1152 1804 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
2495 rhaas 1153 1804 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1154 : 0);
1155 :
1845 teodor 1156 902 : PG_FREE_IF_COPY(in1, 0);
1157 902 : PG_FREE_IF_COPY(in2, 1);
1158 902 : PG_RETURN_FLOAT4(res);
1159 : }
1160 :
1161 : Datum
1162 882 : strict_word_similarity(PG_FUNCTION_ARGS)
1163 : {
1164 882 : text *in1 = PG_GETARG_TEXT_PP(0);
1165 882 : text *in2 = PG_GETARG_TEXT_PP(1);
1166 : float4 res;
1167 :
1168 1764 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1169 1764 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1170 : WORD_SIMILARITY_STRICT);
1171 :
2580 1172 882 : PG_FREE_IF_COPY(in1, 0);
1173 882 : PG_FREE_IF_COPY(in2, 1);
1174 882 : PG_RETURN_FLOAT4(res);
1175 : }
1176 :
1177 : Datum
4509 tgl 1178 1004 : similarity_dist(PG_FUNCTION_ARGS)
1179 : {
1180 1004 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1181 : PG_GETARG_DATUM(0),
1182 : PG_GETARG_DATUM(1)));
1183 :
1184 1004 : PG_RETURN_FLOAT4(1.0 - res);
1185 : }
1186 :
1187 : Datum
6797 bruce 1188 6000 : similarity_op(PG_FUNCTION_ARGS)
1189 : {
4509 tgl 1190 6000 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1191 : PG_GETARG_DATUM(0),
1192 : PG_GETARG_DATUM(1)));
1193 :
2580 teodor 1194 6000 : PG_RETURN_BOOL(res >= similarity_threshold);
1195 : }
1196 :
1197 : Datum
1198 1924 : word_similarity_op(PG_FUNCTION_ARGS)
1199 : {
1200 1924 : text *in1 = PG_GETARG_TEXT_PP(0);
1201 1924 : text *in2 = PG_GETARG_TEXT_PP(1);
1202 : float4 res;
1203 :
1204 3848 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
2495 rhaas 1205 3848 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1206 : WORD_SIMILARITY_CHECK_ONLY);
1207 :
2580 teodor 1208 1924 : PG_FREE_IF_COPY(in1, 0);
1209 1924 : PG_FREE_IF_COPY(in2, 1);
1210 1924 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1211 : }
1212 :
1213 : Datum
1214 1924 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
1215 : {
1216 1924 : text *in1 = PG_GETARG_TEXT_PP(0);
1217 1924 : text *in2 = PG_GETARG_TEXT_PP(1);
1218 : float4 res;
1219 :
1220 3848 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
2495 rhaas 1221 3848 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1222 : WORD_SIMILARITY_CHECK_ONLY);
1223 :
2580 teodor 1224 1924 : PG_FREE_IF_COPY(in1, 0);
1225 1924 : PG_FREE_IF_COPY(in2, 1);
1226 1924 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1227 : }
1228 :
1229 : Datum
2580 teodor 1230 UBC 0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
1231 : {
1232 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1233 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1234 : float4 res;
1235 :
1236 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
2495 rhaas 1237 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1238 : 0);
1239 :
2580 teodor 1240 0 : PG_FREE_IF_COPY(in1, 0);
1241 0 : PG_FREE_IF_COPY(in2, 1);
1242 0 : PG_RETURN_FLOAT4(1.0 - res);
1243 : }
1244 :
1245 : Datum
2580 teodor 1246 CBC 714 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1247 : {
1248 714 : text *in1 = PG_GETARG_TEXT_PP(0);
1249 714 : text *in2 = PG_GETARG_TEXT_PP(1);
1250 : float4 res;
1251 :
1252 1428 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
2495 rhaas 1253 1428 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1254 : 0);
1255 :
1845 teodor 1256 714 : PG_FREE_IF_COPY(in1, 0);
1257 714 : PG_FREE_IF_COPY(in2, 1);
1258 714 : PG_RETURN_FLOAT4(1.0 - res);
1259 : }
1260 :
1261 : Datum
1262 2530 : strict_word_similarity_op(PG_FUNCTION_ARGS)
1263 : {
1264 2530 : text *in1 = PG_GETARG_TEXT_PP(0);
1265 2530 : text *in2 = PG_GETARG_TEXT_PP(1);
1266 : float4 res;
1267 :
1268 5060 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1269 5060 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1270 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1271 :
1272 2530 : PG_FREE_IF_COPY(in1, 0);
1273 2530 : PG_FREE_IF_COPY(in2, 1);
1274 2530 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1275 : }
1276 :
1277 : Datum
1278 2530 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1279 : {
1280 2530 : text *in1 = PG_GETARG_TEXT_PP(0);
1281 2530 : text *in2 = PG_GETARG_TEXT_PP(1);
1282 : float4 res;
1283 :
1284 5060 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1285 5060 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1286 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1287 :
1288 2530 : PG_FREE_IF_COPY(in1, 0);
1289 2530 : PG_FREE_IF_COPY(in2, 1);
1290 2530 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1291 : }
1292 :
1293 : Datum
1845 teodor 1294 UBC 0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1295 : {
1296 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1297 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1298 : float4 res;
1299 :
1300 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1301 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1302 : WORD_SIMILARITY_STRICT);
1303 :
1304 0 : PG_FREE_IF_COPY(in1, 0);
1305 0 : PG_FREE_IF_COPY(in2, 1);
1306 0 : PG_RETURN_FLOAT4(1.0 - res);
1307 : }
1308 :
1309 : Datum
1845 teodor 1310 CBC 720 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1311 : {
1312 720 : text *in1 = PG_GETARG_TEXT_PP(0);
1313 720 : text *in2 = PG_GETARG_TEXT_PP(1);
1314 : float4 res;
1315 :
1316 1440 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1317 1440 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1318 : WORD_SIMILARITY_STRICT);
1319 :
2580 1320 720 : PG_FREE_IF_COPY(in1, 0);
1321 720 : PG_FREE_IF_COPY(in2, 1);
1322 720 : PG_RETURN_FLOAT4(1.0 - res);
1323 : }
|