TLA Line data Source code
1 : /*
2 : * contrib/pg_trgm/trgm_op.c
3 : */
4 : #include "postgres.h"
5 :
6 : #include <ctype.h>
7 :
8 : #include "catalog/pg_type.h"
9 : #include "lib/qunique.h"
10 : #include "miscadmin.h"
11 : #include "trgm.h"
12 : #include "tsearch/ts_locale.h"
13 : #include "utils/guc.h"
14 : #include "utils/lsyscache.h"
15 : #include "utils/memutils.h"
16 : #include "utils/pg_crc.h"
17 :
18 GIC 3 : PG_MODULE_MAGIC;
19 :
20 ECB : /*
 * GUC variables.
 *
 * _PG_init() registers these as pg_trgm.similarity_threshold,
 * pg_trgm.word_similarity_threshold and
 * pg_trgm.strict_word_similarity_threshold, each limited to 0.0 .. 1.0.
 * The initializers below use the same float literals that _PG_init()
 * passes as boot values, so the two must be changed together.
 */
21 : double similarity_threshold = 0.3f;
22 : double word_similarity_threshold = 0.6f;
23 : double strict_word_similarity_threshold = 0.5f;
24 :
/*
 * PG_FUNCTION_INFO_V1 declarations for the Datum-returning entry points of
 * this module (set_limit, show_limit, show_trgm, the similarity functions
 * and their operator wrappers).  Presumably these are the fmgr version-1
 * records the SQL-level function definitions rely on; confirm against the
 * extension's SQL script.
 */
25 CBC 2 : PG_FUNCTION_INFO_V1(set_limit);
26 2 : PG_FUNCTION_INFO_V1(show_limit);
27 2 : PG_FUNCTION_INFO_V1(show_trgm);
28 2 : PG_FUNCTION_INFO_V1(similarity);
29 2 : PG_FUNCTION_INFO_V1(word_similarity);
30 2 : PG_FUNCTION_INFO_V1(strict_word_similarity);
31 2 : PG_FUNCTION_INFO_V1(similarity_dist);
32 2 : PG_FUNCTION_INFO_V1(similarity_op);
33 2 : PG_FUNCTION_INFO_V1(word_similarity_op);
34 2 : PG_FUNCTION_INFO_V1(word_similarity_commutator_op);
35 1 : PG_FUNCTION_INFO_V1(word_similarity_dist_op);
36 2 : PG_FUNCTION_INFO_V1(word_similarity_dist_commutator_op);
37 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_op);
38 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_commutator_op);
39 1 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_op);
40 2 : PG_FUNCTION_INFO_V1(strict_word_similarity_dist_commutator_op);
41 :
42 : /*
 * Trigram with position.
 *
 * Built by make_positional_trgm() and ordered by comp_ptrgm(): trigram
 * value first, then index.
 */
43 : typedef struct
44 : {
45 : trgm trg; /* the trigram itself */
46 : int index; /* position in the text's trigram array, or -1 for a
 * trigram that came from the search pattern */
47 : } pos_trgm;
48 :
49 : /*
 * Trigram bound type: a bit mask stored per generated trigram when the
 * caller of generate_trgm_only() supplies a bounds array.
 */
50 : typedef uint8 TrgmBound;
51 : #define TRGM_BOUND_LEFT 0x01 /* trigram is left bound of word */
52 : #define TRGM_BOUND_RIGHT 0x02 /* trigram is right bound of word */
53 :
54 : /* Word similarity flags (bit mask passed as "flags" to calc_word_similarity) */
55 : #define WORD_SIMILARITY_CHECK_ONLY 0x01 /* only check existence of similar
56 : * search pattern in text */
57 : #define WORD_SIMILARITY_STRICT 0x02 /* force bounds of extent to match
58 : * word bounds */
59 :
60 : /*
61 : * Module load callback
62 : */
63 : void
64 3 : _PG_init(void)
65 : {
66 : /* Define custom GUC variables. */
67 3 : DefineCustomRealVariable("pg_trgm.similarity_threshold",
68 : "Sets the threshold used by the % operator.",
69 : "Valid range is 0.0 .. 1.0.",
70 : &similarity_threshold,
71 : 0.3f,
72 : 0.0,
73 : 1.0,
74 : PGC_USERSET,
75 : 0,
76 : NULL,
77 : NULL,
78 : NULL);
79 3 : DefineCustomRealVariable("pg_trgm.word_similarity_threshold",
80 : "Sets the threshold used by the <% operator.",
81 : "Valid range is 0.0 .. 1.0.",
82 : &word_similarity_threshold,
83 : 0.6f,
84 : 0.0,
85 : 1.0,
86 : PGC_USERSET,
87 : 0,
88 : NULL,
89 : NULL,
90 : NULL);
91 3 : DefineCustomRealVariable("pg_trgm.strict_word_similarity_threshold",
92 : "Sets the threshold used by the <<% operator.",
93 : "Valid range is 0.0 .. 1.0.",
94 : &strict_word_similarity_threshold,
95 : 0.5f,
96 : 0.0,
97 : 1.0,
98 : PGC_USERSET,
99 : 0,
100 : NULL,
101 : NULL,
102 : NULL);
103 :
104 3 : MarkGUCPrefixReserved("pg_trgm");
105 3 : }
106 :
107 : /*
108 : * Deprecated function.
109 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
110 : */
111 : Datum
112 2 : set_limit(PG_FUNCTION_ARGS)
113 : {
114 2 : float4 nlimit = PG_GETARG_FLOAT4(0);
115 : char *nlimit_str;
116 : Oid func_out_oid;
117 : bool is_varlena;
118 :
119 2 : getTypeOutputInfo(FLOAT4OID, &func_out_oid, &is_varlena);
120 :
121 2 : nlimit_str = OidOutputFunctionCall(func_out_oid, Float4GetDatum(nlimit));
122 :
123 2 : SetConfigOption("pg_trgm.similarity_threshold", nlimit_str,
124 : PGC_USERSET, PGC_S_SESSION);
125 :
126 2 : PG_RETURN_FLOAT4(similarity_threshold);
127 : }
128 :
129 :
130 : /*
131 : * Get similarity threshold for given index scan strategy number.
132 : */
133 : double
134 44078 : index_strategy_get_limit(StrategyNumber strategy)
135 : {
136 44078 : switch (strategy)
137 : {
138 33278 : case SimilarityStrategyNumber:
139 33278 : return similarity_threshold;
140 4822 : case WordSimilarityStrategyNumber:
141 4822 : return word_similarity_threshold;
142 5978 : case StrictWordSimilarityStrategyNumber:
143 5978 : return strict_word_similarity_threshold;
144 UBC 0 : default:
145 0 : elog(ERROR, "unrecognized strategy number: %d", strategy);
146 : break;
147 : }
148 :
149 : return 0.0; /* keep compiler quiet */
150 : }
151 :
152 : /*
153 : * Deprecated function.
154 : * Use "pg_trgm.similarity_threshold" GUC variable instead of this function.
155 : */
156 : Datum
157 CBC 20000 : show_limit(PG_FUNCTION_ARGS)
158 : {
159 20000 : PG_RETURN_FLOAT4(similarity_threshold);
160 : }
161 :
/*
 * qsort/qunique comparator for trigram arrays; the ordering is whatever
 * CMPTRGM defines.
 */
static int
comp_trgm(const void *a, const void *b)
{
    int         order = CMPTRGM(a, b);

    return order;
}
167 :
168 : /*
169 : * Finds first word in string, returns pointer to the word,
170 : * endword points to the character after word
171 : */
172 : static char *
173 239413 : find_word(char *str, int lenstr, char **endword, int *charlen)
174 : {
175 239413 : char *beginword = str;
176 :
177 253079 : while (beginword - str < lenstr && !ISWORDCHR(beginword))
178 13666 : beginword += pg_mblen(beginword);
179 :
180 239413 : if (beginword - str >= lenstr)
181 113080 : return NULL;
182 :
183 126333 : *endword = beginword;
184 126333 : *charlen = 0;
185 1087687 : while (*endword - str < lenstr && ISWORDCHR(*endword))
186 : {
187 961354 : *endword += pg_mblen(*endword);
188 961354 : (*charlen)++;
189 : }
190 :
191 126333 : return beginword;
192 : }
193 :
194 : /*
195 : * Reduce a trigram (three possibly multi-byte characters) to a trgm,
196 : * which is always exactly three bytes. If we have three single-byte
197 : * characters, we just use them as-is; otherwise we form a hash value.
198 : */
199 : void
200 1459 : compact_trigram(trgm *tptr, char *str, int bytelen)
201 : {
202 1459 : if (bytelen == 3)
203 : {
204 1459 : CPTRGM(tptr, str);
205 : }
206 : else
207 : {
208 : pg_crc32 crc;
209 :
210 UBC 0 : INIT_LEGACY_CRC32(crc);
211 0 : COMP_LEGACY_CRC32(crc, str, bytelen);
212 0 : FIN_LEGACY_CRC32(crc);
213 :
214 : /*
215 : * use only 3 upper bytes from crc, hope, it's good enough hashing
216 : */
217 0 : CPTRGM(tptr, &crc);
218 : }
219 CBC 1459 : }
220 :
221 : /*
222 : * Adds trigrams from words (already padded).
223 : */
224 : static trgm *
225 126397 : make_trigrams(trgm *tptr, char *str, int bytelen, int charlen)
226 : {
227 126397 : char *ptr = str;
228 :
229 126397 : if (charlen < 3)
230 27 : return tptr;
231 :
232 126370 : if (bytelen > charlen)
233 : {
234 : /* Find multibyte character boundaries and apply compact_trigram */
235 UBC 0 : int lenfirst = pg_mblen(str),
236 0 : lenmiddle = pg_mblen(str + lenfirst),
237 0 : lenlast = pg_mblen(str + lenfirst + lenmiddle);
238 :
239 0 : while ((ptr - str) + lenfirst + lenmiddle + lenlast <= bytelen)
240 : {
241 0 : compact_trigram(tptr, ptr, lenfirst + lenmiddle + lenlast);
242 :
243 0 : ptr += lenfirst;
244 0 : tptr++;
245 :
246 0 : lenfirst = lenmiddle;
247 0 : lenmiddle = lenlast;
248 0 : lenlast = pg_mblen(ptr + lenfirst + lenmiddle);
249 : }
250 : }
251 : else
252 : {
253 : /* Fast path when there are no multibyte characters */
254 CBC 126370 : Assert(bytelen == charlen);
255 :
256 1214148 : while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
257 : {
258 1087778 : CPTRGM(tptr, ptr);
259 1087778 : ptr++;
260 1087778 : tptr++;
261 : }
262 : }
263 :
264 126370 : return tptr;
265 : }
266 :
267 : /*
268 : * Make array of trigrams without sorting and removing duplicate items.
269 : *
270 : * trg: where to return the array of trigrams.
271 : * str: source string, of length slen bytes.
272 : * bounds: where to return bounds of trigrams (if needed).
273 : *
274 : * Returns length of the generated array.
275 : */
276 : static int
277 113081 : generate_trgm_only(trgm *trg, char *str, int slen, TrgmBound *bounds)
278 : {
279 : trgm *tptr;
280 : char *buf;
281 : int charlen,
282 : bytelen;
283 : char *bword,
284 : *eword;
285 :
286 113081 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
287 1 : return 0;
288 :
289 113080 : tptr = trg;
290 :
291 : /* Allocate a buffer for case-folded, blank-padded words */
292 113080 : buf = (char *) palloc(slen * pg_database_encoding_max_length() + 4);
293 :
294 : if (LPADDING > 0)
295 : {
296 113080 : *buf = ' ';
297 : if (LPADDING > 1)
298 113080 : *(buf + 1) = ' ';
299 : }
300 :
301 113080 : eword = str;
302 239413 : while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
303 : {
304 : #ifdef IGNORECASE
305 126333 : bword = lowerstr_with_len(bword, eword - bword);
306 126333 : bytelen = strlen(bword);
307 : #else
308 : bytelen = eword - bword;
309 : #endif
310 :
311 126333 : memcpy(buf + LPADDING, bword, bytelen);
312 :
313 : #ifdef IGNORECASE
314 126333 : pfree(bword);
315 : #endif
316 :
317 126333 : buf[LPADDING + bytelen] = ' ';
318 126333 : buf[LPADDING + bytelen + 1] = ' ';
319 :
320 : /* Calculate trigrams marking their bounds if needed */
321 126333 : if (bounds)
322 12400 : bounds[tptr - trg] |= TRGM_BOUND_LEFT;
323 126333 : tptr = make_trigrams(tptr, buf, bytelen + LPADDING + RPADDING,
324 : charlen + LPADDING + RPADDING);
325 126333 : if (bounds)
326 12400 : bounds[tptr - trg - 1] |= TRGM_BOUND_RIGHT;
327 : }
328 :
329 113080 : pfree(buf);
330 :
331 113080 : return tptr - trg;
332 : }
333 :
334 : /*
335 : * Guard against possible overflow in the palloc requests below. (We
336 : * don't worry about the additive constants, since palloc can detect
337 : * requests that are a little above MaxAllocSize --- we just need to
338 : * prevent integer overflow in the multiplications.)
339 : */
340 : static void
341 101010 : protect_out_of_mem(int slen)
342 : {
343 101010 : if ((Size) (slen / 2) >= (MaxAllocSize / (sizeof(trgm) * 3)) ||
344 101010 : (Size) slen >= (MaxAllocSize / pg_database_encoding_max_length()))
345 UBC 0 : ereport(ERROR,
346 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
347 : errmsg("out of memory")));
348 CBC 101010 : }
349 :
350 : /*
351 : * Make array of trigrams with sorting and removing duplicate items.
352 : *
353 : * str: source string, of length slen bytes.
354 : *
355 : * Returns the sorted array of unique trigrams.
356 : */
357 : TRGM *
358 88829 : generate_trgm(char *str, int slen)
359 : {
360 : TRGM *trg;
361 : int len;
362 :
363 88829 : protect_out_of_mem(slen);
364 :
365 88829 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
366 88829 : trg->flag = ARRKEY;
367 :
368 88829 : len = generate_trgm_only(GETARR(trg), str, slen, NULL);
369 88829 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
370 :
371 88829 : if (len == 0)
372 4 : return trg;
373 :
374 : /*
375 : * Make trigrams unique.
376 : */
377 88825 : if (len > 1)
378 : {
379 GNC 88825 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
380 CBC 88825 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
381 : }
382 :
383 88825 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
384 :
385 88825 : return trg;
386 : }
387 :
388 : /*
389 : * Make array of positional trigrams from two trigram arrays trg1 and trg2.
390 : *
391 : * trg1: trigram array of search pattern, of length len1. trg1 is required
392 : * word which positions don't matter and replaced with -1.
393 : * trg2: trigram array of text, of length len2. trg2 is haystack where we
394 : * search and have to store its positions.
395 : *
396 : * Returns concatenated trigram array.
397 : */
398 : static pos_trgm *
399 12126 : make_positional_trgm(trgm *trg1, int len1, trgm *trg2, int len2)
400 : {
401 : pos_trgm *result;
402 : int i,
403 12126 : len = len1 + len2;
404 :
405 12126 : result = (pos_trgm *) palloc(sizeof(pos_trgm) * len);
406 :
407 120864 : for (i = 0; i < len1; i++)
408 : {
409 108738 : memcpy(&result[i].trg, &trg1[i], sizeof(trgm));
410 108738 : result[i].index = -1;
411 : }
412 :
413 192225 : for (i = 0; i < len2; i++)
414 : {
415 180099 : memcpy(&result[i + len1].trg, &trg2[i], sizeof(trgm));
416 180099 : result[i + len1].index = i;
417 : }
418 :
419 12126 : return result;
420 : }
421 :
422 : /*
423 : * Compare position trigrams: compare trigrams first and position second.
424 : */
425 : static int
426 1307800 : comp_ptrgm(const void *v1, const void *v2)
427 : {
428 1307800 : const pos_trgm *p1 = (const pos_trgm *) v1;
429 1307800 : const pos_trgm *p2 = (const pos_trgm *) v2;
430 : int cmp;
431 :
432 1307800 : cmp = CMPTRGM(p1->trg, p2->trg);
433 1307800 : if (cmp != 0)
434 1268095 : return cmp;
435 :
436 39705 : if (p1->index < p2->index)
437 21365 : return -1;
438 18340 : else if (p1->index == p2->index)
439 UBC 0 : return 0;
440 : else
441 CBC 18340 : return 1;
442 : }
443 :
444 : /*
 * Iterative search function which calculates the maximum similarity between
 * the search pattern and any extent of the text.  The true maximum is
 * calculated only if the flag WORD_SIMILARITY_CHECK_ONLY isn't set; with
 * that flag the search stops as soon as the running maximum reaches the
 * applicable threshold, so the returned value is then only guaranteed to be
 * >= the threshold when a match exists.
 *
 * trg2indexes: array which stores indexes of the array "found".
 * found: array which stores true or false values.
 * ulen1: count of unique trigrams of array "trg1".
 * len2: length of array "trg2" and array "trg2indexes".
 * len: length of the array "found".
 * flags: set of boolean flags parameterizing similarity calculation.
 * bounds: whether each trigram is left/right bound of word; must be
 *         non-NULL when WORD_SIMILARITY_STRICT is set (asserted below).
 *
 * Returns word similarity.
 */
459 : static float4
460 GIC 12126 : iterate_word_similarity(int *trg2indexes,
461 ECB : bool *found,
462 : int ulen1,
463 : int len2,
464 : int len,
465 : uint8 flags,
466 : TrgmBound *bounds)
467 : {
468 : int *lastpos,
469 : i,
470 GIC 12126 : ulen2 = 0,
471 CBC 12126 : count = 0,
472 12126 : upper = -1,
473 ECB : lower;
474 : float4 smlr_cur,
475 GIC 12126 : smlr_max = 0.0f;
476 ECB : double threshold;
477 :
478 GIC 12126 : Assert(bounds || !(flags & WORD_SIMILARITY_STRICT));
479 ECB :
480 : /* Select appropriate threshold */
481 GIC 24252 : threshold = (flags & WORD_SIMILARITY_STRICT) ?
482 CBC 12126 : strict_word_similarity_threshold :
483 ECB : word_similarity_threshold;
484 :
485 : /*
486 : * Consider first trigram as initial lower bound for strict word
487 : * similarity, or initialize it later with first trigram present for plain
488 : * word similarity.
489 : */
490 GIC 12126 : lower = (flags & WORD_SIMILARITY_STRICT) ? 0 : -1;
491 ECB :
492 : /* Memorise last position of each trigram */
/* (filling with all-ones bytes makes every int element -1, i.e. "not seen") */
493 GIC 12126 : lastpos = (int *) palloc(sizeof(int) * len);
494 CBC 12126 : memset(lastpos, -1, sizeof(int) * len);
495 ECB :
496 GIC 183655 : for (i = 0; i < len2; i++)
497 ECB : {
498 : int trgindex;
499 :
500 GNC 173313 : CHECK_FOR_INTERRUPTS();
501 :
502 : /* Get index of next trigram */
503 173313 : trgindex = trg2indexes[i];
504 :
505 ECB : /* Update last position of this trigram */
506 GIC 173313 : if (lower >= 0 || found[trgindex])
507 : {
508 CBC 135805 : if (lastpos[trgindex] < 0)
509 : {
510 GIC 133952 : ulen2++;
511 CBC 133952 : if (found[trgindex])
512 GIC 30756 : count++;
513 ECB : }
514 GIC 135805 : lastpos[trgindex] = i;
515 ECB : }
516 :
517 : /*
518 : * Adjust upper bound if trigram is upper bound of word for strict
519 : * word similarity, or if trigram is present in required substring for
520 : * plain word similarity
521 : */
522 GIC 250355 : if ((flags & WORD_SIMILARITY_STRICT) ? (bounds[i] & TRGM_BOUND_RIGHT)
523 77042 : : found[trgindex])
524 : {
525 : int prev_lower,
526 : tmp_ulen2,
527 ECB : tmp_lower,
528 : tmp_count;
529 :
530 GIC 25638 : upper = i;
531 25638 : if (lower == -1)
532 : {
533 4695 : lower = i;
534 4695 : ulen2 = 1;
535 ECB : }
536 :
537 GIC 25638 : smlr_cur = CALCSML(count, ulen1, ulen2);
538 ECB :
539 : /* Also try to adjust lower bound for greater similarity */
540 GIC 25638 : tmp_count = count;
541 25638 : tmp_ulen2 = ulen2;
542 CBC 25638 : prev_lower = lower;
543 GIC 208652 : for (tmp_lower = lower; tmp_lower <= upper; tmp_lower++)
544 : {
545 ECB : float smlr_tmp;
546 : int tmp_trgindex;
547 :
548 : /*
549 : * Adjust lower bound only if trigram is lower bound of word
550 : * for strict word similarity, or consider every trigram as
551 : * lower bound for plain word similarity.
552 : */
553 GIC 184798 : if (!(flags & WORD_SIMILARITY_STRICT)
554 145233 : || (bounds[tmp_lower] & TRGM_BOUND_LEFT))
555 : {
556 59704 : smlr_tmp = CALCSML(tmp_count, ulen1, tmp_ulen2);
557 59704 : if (smlr_tmp > smlr_cur)
558 ECB : {
559 CBC 3511 : smlr_cur = smlr_tmp;
560 GIC 3511 : ulen2 = tmp_ulen2;
561 CBC 3511 : lower = tmp_lower;
562 3511 : count = tmp_count;
563 : }
564 ECB :
565 : /*
566 : * If we only check that word similarity is greater than
567 : * threshold we do not need to calculate a maximum
568 : * similarity.
569 : */
570 GIC 59704 : if ((flags & WORD_SIMILARITY_CHECK_ONLY)
571 37114 : && smlr_cur >= threshold)
572 1784 : break;
573 : }
574 :
/* Dropping tmp_lower from the trial extent: un-count its trigram if this was its last occurrence */
575 CBC 183014 : tmp_trgindex = trg2indexes[tmp_lower];
576 183014 : if (lastpos[tmp_trgindex] == tmp_lower)
577 ECB : {
578 GIC 180753 : tmp_ulen2--;
579 180753 : if (found[tmp_trgindex])
580 CBC 46591 : tmp_count--;
581 ECB : }
582 : }
583 :
584 CBC 25638 : smlr_max = Max(smlr_max, smlr_cur);
585 ECB :
586 : /*
587 : * if we only check that word similarity is greater than threshold
588 : * we do not need to calculate a maximum similarity.
589 : */
590 GIC 25638 : if ((flags & WORD_SIMILARITY_CHECK_ONLY) && smlr_max >= threshold)
591 1784 : break;
592 :
/* Forget trigrams that fell out of the extent when the lower bound advanced */
593 40602 : for (tmp_lower = prev_lower; tmp_lower < lower; tmp_lower++)
594 : {
595 ECB : int tmp_trgindex;
596 :
597 GIC 16748 : tmp_trgindex = trg2indexes[tmp_lower];
598 CBC 16748 : if (lastpos[tmp_trgindex] == tmp_lower)
599 GIC 16000 : lastpos[tmp_trgindex] = -1;
600 : }
601 : }
602 ECB : }
603 :
604 CBC 12126 : pfree(lastpos);
605 :
606 GIC 12126 : return smlr_max;
607 : }
608 :
609 ECB : /*
610 : * Calculate word similarity.
611 : * This function prepare two arrays: "trg2indexes" and "found". Then this arrays
612 : * are used to calculate word similarity using iterate_word_similarity().
613 : *
614 : * "trg2indexes" is array which stores indexes of the array "found".
615 : * In other words:
616 : * trg2indexes[j] = i;
617 : * found[i] = true (or false);
618 : * If found[i] == true then there is trigram trg2[j] in array "trg1".
619 : * If found[i] == false then there is not trigram trg2[j] in array "trg1".
620 : *
621 : * str1: search pattern string, of length slen1 bytes.
622 : * str2: text in which we are looking for a word, of length slen2 bytes.
623 : * flags: set of boolean flags parameterizing similarity calculation.
624 : *
625 : * Returns word similarity.
626 : */
627 : static float4
628 GIC 12126 : calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
629 : uint8 flags)
630 : {
631 : bool *found;
632 : pos_trgm *ptrg;
633 ECB : trgm *trg1;
634 : trgm *trg2;
635 : int len1,
636 : len2,
637 : len,
638 : i,
639 : j,
640 : ulen1;
641 : int *trg2indexes;
642 : float4 result;
643 : TrgmBound *bounds;
644 :
645 GIC 12126 : protect_out_of_mem(slen1 + slen2);
646 :
647 : /* Make positional trigrams */
648 12126 : trg1 = (trgm *) palloc(sizeof(trgm) * (slen1 / 2 + 1) * 3);
649 12126 : trg2 = (trgm *) palloc(sizeof(trgm) * (slen2 / 2 + 1) * 3);
650 CBC 12126 : if (flags & WORD_SIMILARITY_STRICT)
651 GIC 6662 : bounds = (TrgmBound *) palloc0(sizeof(TrgmBound) * (slen2 / 2 + 1) * 3);
652 : else
653 CBC 5464 : bounds = NULL;
654 ECB :
655 CBC 12126 : len1 = generate_trgm_only(trg1, str1, slen1, NULL);
656 12126 : len2 = generate_trgm_only(trg2, str2, slen2, bounds);
657 :
658 12126 : ptrg = make_positional_trgm(trg1, len1, trg2, len2);
659 GIC 12126 : len = len1 + len2;
660 CBC 12126 : qsort(ptrg, len, sizeof(pos_trgm), comp_ptrgm);
661 ECB :
662 GIC 12126 : pfree(trg1);
663 CBC 12126 : pfree(trg2);
664 ECB :
665 : /*
666 : * Merge positional trigrams array: enumerate each trigram and find its
667 : * presence in required word.
668 : */
669 GIC 12126 : trg2indexes = (int *) palloc(sizeof(int) * len2);
670 12126 : found = (bool *) palloc0(sizeof(bool) * len);
671 :
672 12126 : ulen1 = 0;
673 12126 : j = 0;
674 CBC 300963 : for (i = 0; i < len; i++)
675 ECB : {
676 GIC 288837 : if (i > 0)
677 ECB : {
678 CBC 276711 : int cmp = CMPTRGM(ptrg[i - 1].trg, ptrg[i].trg);
679 ECB :
680 GIC 276711 : if (cmp != 0)
681 ECB : {
682 GIC 242510 : if (found[j])
683 CBC 101138 : ulen1++;
684 GIC 242510 : j++;
685 ECB : }
686 : }
687 :
688 CBC 288837 : if (ptrg[i].index >= 0)
689 ECB : {
690 GIC 180099 : trg2indexes[ptrg[i].index] = j;
691 : }
692 : else
693 ECB : {
694 GIC 108738 : found[j] = true;
695 ECB : }
696 : }
697 GIC 12126 : if (found[j])
698 7600 : ulen1++;
699 ECB :
700 : /* Run iterative procedure to find maximum similarity with word */
701 GIC 12126 : result = iterate_word_similarity(trg2indexes, found, ulen1, len2, len,
702 ECB : flags, bounds);
703 :
704 GIC 12126 : pfree(trg2indexes);
705 12126 : pfree(found);
706 CBC 12126 : pfree(ptrg);
707 :
708 GIC 12126 : return result;
709 ECB : }
710 :
711 :
712 : /*
 * Extract the next non-wildcard part of a search string, i.e. a word bounded
 * by '_' or '%' meta-characters, non-word characters or string end.
 *
 * str: source string, of length lenstr bytes (need not be null-terminated)
 * buf: where to return the substring (must be long enough)
 * *bytelen: receives byte length of the found substring
 * *charlen: receives character length of the found substring
 *
 * Both lengths include any padding spaces this function adds to buf, and
 * escape characters are stripped while copying.
 *
 * Returns pointer to end+1 of the found substring in the source string.
 * Returns NULL if no word found (in which case buf, bytelen, charlen not set)
 *
 * If the found word is bounded by non-word characters or string boundaries
 * then this function will include corresponding padding spaces into buf.
 */
727 : static const char *
728 GIC 119 : get_wildcard_part(const char *str, int lenstr,
729 : char *buf, int *bytelen, int *charlen)
730 : {
731 119 : const char *beginword = str;
732 : const char *endword;
733 CBC 119 : char *s = buf;
734 GIC 119 : bool in_leading_wildcard_meta = false;
735 119 : bool in_trailing_wildcard_meta = false;
736 CBC 119 : bool in_escape = false;
737 : int clen;
738 ECB :
739 : /*
740 : * Find the first word character, remembering whether preceding character
741 : * was wildcard meta-character. Note that the in_escape state persists
742 : * from this loop to the next one, since we may exit at a word character
743 : * that is in_escape.
744 : */
745 GIC 241 : while (beginword - str < lenstr)
746 : {
747 186 : if (in_escape)
748 : {
749 3 : if (ISWORDCHR(beginword))
750 CBC 3 : break;
751 UIC 0 : in_escape = false;
752 LBC 0 : in_leading_wildcard_meta = false;
753 : }
754 ECB : else
755 : {
756 GBC 183 : if (ISESCAPECHAR(beginword))
757 3 : in_escape = true;
758 GIC 180 : else if (ISWILDCARDCHAR(beginword))
759 104 : in_leading_wildcard_meta = true;
760 76 : else if (ISWORDCHR(beginword))
761 CBC 61 : break;
762 ECB : else
763 CBC 15 : in_leading_wildcard_meta = false;
764 ECB : }
765 CBC 122 : beginword += pg_mblen(beginword);
766 ECB : }
767 :
768 : /*
769 : * Handle string end.
770 : */
771 GIC 119 : if (beginword - str >= lenstr)
772 55 : return NULL;
773 :
774 : /*
775 : * Add left padding spaces if preceding character wasn't wildcard
776 ECB : * meta-character.
777 : */
778 GIC 64 : *charlen = 0;
779 64 : if (!in_leading_wildcard_meta)
780 : {
781 : if (LPADDING > 0)
782 : {
783 CBC 15 : *s++ = ' ';
784 15 : (*charlen)++;
785 : if (LPADDING > 1)
786 : {
787 GIC 15 : *s++ = ' ';
788 CBC 15 : (*charlen)++;
789 ECB : }
790 : }
791 : }
792 :
793 : /*
794 : * Copy data into buf until wildcard meta-character, non-word character or
795 : * string boundary. Strip escapes during copy.
796 : */
797 GIC 64 : endword = beginword;
798 244 : while (endword - str < lenstr)
799 : {
800 244 : clen = pg_mblen(endword);
801 244 : if (in_escape)
802 ECB : {
803 CBC 3 : if (ISWORDCHR(endword))
804 : {
805 3 : memcpy(s, endword, clen);
806 3 : (*charlen)++;
807 GIC 3 : s += clen;
808 ECB : }
809 : else
810 : {
811 : /*
812 : * Back up endword to the escape character when stopping at an
813 : * escaped char, so that subsequent get_wildcard_part will
814 : * restart from the escape character. We assume here that
815 : * escape chars are single-byte.
816 : */
817 UIC 0 : endword--;
818 0 : break;
819 : }
820 GIC 3 : in_escape = false;
821 : }
822 EUB : else
823 : {
824 GIC 241 : if (ISESCAPECHAR(endword))
825 LBC 0 : in_escape = true;
826 GIC 241 : else if (ISWILDCARDCHAR(endword))
827 : {
828 55 : in_trailing_wildcard_meta = true;
829 CBC 55 : break;
830 EUB : }
831 CBC 186 : else if (ISWORDCHR(endword))
832 : {
833 177 : memcpy(s, endword, clen);
834 177 : (*charlen)++;
835 GIC 177 : s += clen;
836 ECB : }
837 : else
838 CBC 9 : break;
839 ECB : }
840 CBC 180 : endword += clen;
841 : }
842 :
843 ECB : /*
844 : * Add right padding spaces if next character isn't wildcard
845 : * meta-character.
846 : */
847 GIC 64 : if (!in_trailing_wildcard_meta)
848 : {
849 : if (RPADDING > 0)
850 : {
851 9 : *s++ = ' ';
852 CBC 9 : (*charlen)++;
853 : if (RPADDING > 1)
854 : {
855 : *s++ = ' ';
856 ECB : (*charlen)++;
857 : }
858 : }
859 : }
860 :
/* Report how many bytes were written to buf, padding included */
861 GIC 64 : *bytelen = s - buf;
862 64 : return endword;
863 : }
864 :
865 : /*
866 ECB : * Generates trigrams for wildcard search string.
867 : *
868 : * Returns array of trigrams that must occur in any string that matches the
869 : * wildcard string. For example, given pattern "a%bcd%" the trigrams
870 : * " a", "bcd" would be extracted.
871 : */
872 : TRGM *
873 GIC 55 : generate_wildcard_trgm(const char *str, int slen)
874 : {
875 : TRGM *trg;
876 : char *buf,
877 : *buf2;
878 ECB : trgm *tptr;
879 : int len,
880 : charlen,
881 : bytelen;
882 : const char *eword;
883 :
884 GIC 55 : protect_out_of_mem(slen);
885 :
886 55 : trg = (TRGM *) palloc(TRGMHDRSIZE + sizeof(trgm) * (slen / 2 + 1) * 3);
887 55 : trg->flag = ARRKEY;
888 55 : SET_VARSIZE(trg, TRGMHDRSIZE);
889 ECB :
890 GIC 55 : if (slen + LPADDING + RPADDING < 3 || slen == 0)
891 LBC 0 : return trg;
892 ECB :
893 CBC 55 : tptr = GETARR(trg);
894 :
895 ECB : /* Allocate a buffer for blank-padded, but not yet case-folded, words */
896 GBC 55 : buf = palloc(sizeof(char) * (slen + 4));
897 :
898 ECB : /*
899 : * Extract trigrams from each substring extracted by get_wildcard_part.
900 : */
901 CBC 55 : eword = str;
902 GIC 119 : while ((eword = get_wildcard_part(eword, slen - (eword - str),
903 119 : buf, &bytelen, &charlen)) != NULL)
904 : {
905 : #ifdef IGNORECASE
906 CBC 64 : buf2 = lowerstr_with_len(buf, bytelen);
907 64 : bytelen = strlen(buf2);
908 ECB : #else
909 : buf2 = buf;
910 : #endif
911 :
912 : /*
913 : * count trigrams
914 : */
915 GIC 64 : tptr = make_trigrams(tptr, buf2, bytelen, charlen);
916 :
917 : #ifdef IGNORECASE
918 64 : pfree(buf2);
919 : #endif
920 ECB : }
921 :
922 GIC 55 : pfree(buf);
923 ECB :
924 GIC 55 : if ((len = tptr - GETARR(trg)) == 0)
925 24 : return trg;
926 :
927 ECB : /*
928 : * Make trigrams unique.
929 : */
930 CBC 31 : if (len > 1)
931 : {
932 GNC 17 : qsort(GETARR(trg), len, sizeof(trgm), comp_trgm);
933 GIC 17 : len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
934 : }
935 ECB :
936 GIC 31 : SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
937 ECB :
938 CBC 31 : return trg;
939 : }
940 :
941 ECB : uint32
942 GIC 34773 : trgm2int(trgm *ptr)
943 ECB : {
944 GIC 34773 : uint32 val = 0;
945 :
946 34773 : val |= *(((unsigned char *) ptr));
947 CBC 34773 : val <<= 8;
948 GIC 34773 : val |= *(((unsigned char *) ptr) + 1);
949 CBC 34773 : val <<= 8;
950 GIC 34773 : val |= *(((unsigned char *) ptr) + 2);
951 ECB :
952 CBC 34773 : return val;
953 ECB : }
954 :
955 : Datum
956 GIC 7 : show_trgm(PG_FUNCTION_ARGS)
957 ECB : {
958 GIC 7 : text *in = PG_GETARG_TEXT_PP(0);
959 : TRGM *trg;
960 : Datum *d;
961 ECB : ArrayType *a;
962 : trgm *ptr;
963 : int i;
964 :
965 GIC 7 : trg = generate_trgm(VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in));
966 7 : d = (Datum *) palloc(sizeof(Datum) * (1 + ARRNELEM(trg)));
967 :
968 44 : for (i = 0, ptr = GETARR(trg); i < ARRNELEM(trg); i++, ptr++)
969 : {
970 CBC 37 : text *item = (text *) palloc(VARHDRSZ + Max(12, pg_database_encoding_max_length() * 3));
971 ECB :
972 GIC 37 : if (pg_database_encoding_max_length() > 1 && !ISPRINTABLETRGM(ptr))
973 ECB : {
974 UIC 0 : snprintf(VARDATA(item), 12, "0x%06x", trgm2int(ptr));
975 LBC 0 : SET_VARSIZE(item, VARHDRSZ + strlen(VARDATA(item)));
976 : }
977 ECB : else
978 : {
979 GBC 37 : SET_VARSIZE(item, VARHDRSZ + 3);
980 37 : CPTRGM(VARDATA(item), ptr);
981 : }
982 GIC 37 : d[i] = PointerGetDatum(item);
983 : }
984 ECB :
985 GNC 7 : a = construct_array_builtin(d, ARRNELEM(trg), TEXTOID);
986 :
987 CBC 44 : for (i = 0; i < ARRNELEM(trg); i++)
988 37 : pfree(DatumGetPointer(d[i]));
989 :
990 7 : pfree(d);
991 7 : pfree(trg);
992 7 : PG_FREE_IF_COPY(in, 0);
993 :
994 7 : PG_RETURN_POINTER(a);
995 : }
996 :
997 : float4
998 69791 : cnt_sml(TRGM *trg1, TRGM *trg2, bool inexact)
999 : {
1000 : trgm *ptr1,
1001 : *ptr2;
1002 69791 : int count = 0;
1003 : int len1,
1004 : len2;
1005 :
1006 69791 : ptr1 = GETARR(trg1);
1007 69791 : ptr2 = GETARR(trg2);
1008 :
1009 69791 : len1 = ARRNELEM(trg1);
1010 69791 : len2 = ARRNELEM(trg2);
1011 :
1012 : /* explicit test is needed to avoid 0/0 division when both lengths are 0 */
1013 69791 : if (len1 <= 0 || len2 <= 0)
1014 1 : return (float4) 0.0;
1015 :
1016 891582 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1017 : {
1018 821792 : int res = CMPTRGM(ptr1, ptr2);
1019 :
1020 821792 : if (res < 0)
1021 189653 : ptr1++;
1022 632139 : else if (res > 0)
1023 220022 : ptr2++;
1024 : else
1025 : {
1026 412117 : ptr1++;
1027 412117 : ptr2++;
1028 412117 : count++;
1029 : }
1030 : }
1031 :
1032 : /*
1033 : * If inexact then len2 is equal to count, because we don't know actual
1034 : * length of second string in inexact search and we can assume that count
1035 : * is a lower bound of len2.
1036 : */
1037 69790 : return CALCSML(count, len1, inexact ? count : len2);
1038 : }
1039 :
1040 :
1041 : /*
1042 : * Returns whether trg2 contains all trigrams in trg1.
1043 : * This relies on the trigram arrays being sorted.
1044 : */
1045 : bool
1046 190 : trgm_contained_by(TRGM *trg1, TRGM *trg2)
1047 : {
1048 : trgm *ptr1,
1049 : *ptr2;
1050 : int len1,
1051 : len2;
1052 :
1053 190 : ptr1 = GETARR(trg1);
1054 190 : ptr2 = GETARR(trg2);
1055 :
1056 190 : len1 = ARRNELEM(trg1);
1057 190 : len2 = ARRNELEM(trg2);
1058 :
1059 622 : while (ptr1 - GETARR(trg1) < len1 && ptr2 - GETARR(trg2) < len2)
1060 : {
1061 599 : int res = CMPTRGM(ptr1, ptr2);
1062 :
1063 599 : if (res < 0)
1064 167 : return false;
1065 432 : else if (res > 0)
1066 320 : ptr2++;
1067 : else
1068 : {
1069 112 : ptr1++;
1070 112 : ptr2++;
1071 : }
1072 : }
1073 23 : if (ptr1 - GETARR(trg1) < len1)
1074 4 : return false;
1075 : else
1076 19 : return true;
1077 : }
1078 :
1079 : /*
1080 : * Return a palloc'd boolean array showing, for each trigram in "query",
1081 : * whether it is present in the trigram array "key".
1082 : * This relies on the "key" array being sorted, but "query" need not be.
1083 : */
1084 : bool *
1085 2150 : trgm_presence_map(TRGM *query, TRGM *key)
1086 : {
1087 : bool *result;
1088 2150 : trgm *ptrq = GETARR(query),
1089 2150 : *ptrk = GETARR(key);
1090 2150 : int lenq = ARRNELEM(query),
1091 2150 : lenk = ARRNELEM(key),
1092 : i;
1093 :
1094 2150 : result = (bool *) palloc0(lenq * sizeof(bool));
1095 :
1096 : /* for each query trigram, do a binary search in the key array */
1097 507560 : for (i = 0; i < lenq; i++)
1098 : {
1099 505410 : int lo = 0;
1100 505410 : int hi = lenk;
1101 :
1102 2373653 : while (lo < hi)
1103 : {
1104 1876282 : int mid = (lo + hi) / 2;
1105 1876282 : int res = CMPTRGM(ptrq, ptrk + mid);
1106 :
1107 1876282 : if (res < 0)
1108 784082 : hi = mid;
1109 1092200 : else if (res > 0)
1110 1084161 : lo = mid + 1;
1111 : else
1112 : {
1113 8039 : result[i] = true;
1114 8039 : break;
1115 : }
1116 : }
1117 505410 : ptrq++;
1118 : }
1119 :
1120 2150 : return result;
1121 : }
1122 :
1123 : Datum
1124 31452 : similarity(PG_FUNCTION_ARGS)
1125 : {
1126 31452 : text *in1 = PG_GETARG_TEXT_PP(0);
1127 31452 : text *in2 = PG_GETARG_TEXT_PP(1);
1128 : TRGM *trg1,
1129 : *trg2;
1130 : float4 res;
1131 :
1132 31452 : trg1 = generate_trgm(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1));
1133 31452 : trg2 = generate_trgm(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2));
1134 :
1135 31452 : res = cnt_sml(trg1, trg2, false);
1136 :
1137 31452 : pfree(trg1);
1138 31452 : pfree(trg2);
1139 31452 : PG_FREE_IF_COPY(in1, 0);
1140 31452 : PG_FREE_IF_COPY(in2, 1);
1141 :
1142 31452 : PG_RETURN_FLOAT4(res);
1143 : }
1144 :
1145 : Datum
1146 902 : word_similarity(PG_FUNCTION_ARGS)
1147 : {
1148 902 : text *in1 = PG_GETARG_TEXT_PP(0);
1149 902 : text *in2 = PG_GETARG_TEXT_PP(1);
1150 : float4 res;
1151 :
1152 1804 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1153 1804 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1154 : 0);
1155 :
1156 902 : PG_FREE_IF_COPY(in1, 0);
1157 902 : PG_FREE_IF_COPY(in2, 1);
1158 902 : PG_RETURN_FLOAT4(res);
1159 : }
1160 :
1161 : Datum
1162 882 : strict_word_similarity(PG_FUNCTION_ARGS)
1163 : {
1164 882 : text *in1 = PG_GETARG_TEXT_PP(0);
1165 882 : text *in2 = PG_GETARG_TEXT_PP(1);
1166 : float4 res;
1167 :
1168 1764 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1169 1764 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1170 : WORD_SIMILARITY_STRICT);
1171 :
1172 882 : PG_FREE_IF_COPY(in1, 0);
1173 882 : PG_FREE_IF_COPY(in2, 1);
1174 882 : PG_RETURN_FLOAT4(res);
1175 : }
1176 :
1177 : Datum
1178 1004 : similarity_dist(PG_FUNCTION_ARGS)
1179 : {
1180 1004 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1181 : PG_GETARG_DATUM(0),
1182 : PG_GETARG_DATUM(1)));
1183 :
1184 1004 : PG_RETURN_FLOAT4(1.0 - res);
1185 : }
1186 :
1187 : Datum
1188 6000 : similarity_op(PG_FUNCTION_ARGS)
1189 : {
1190 6000 : float4 res = DatumGetFloat4(DirectFunctionCall2(similarity,
1191 : PG_GETARG_DATUM(0),
1192 : PG_GETARG_DATUM(1)));
1193 :
1194 6000 : PG_RETURN_BOOL(res >= similarity_threshold);
1195 : }
1196 :
1197 : Datum
1198 1924 : word_similarity_op(PG_FUNCTION_ARGS)
1199 : {
1200 1924 : text *in1 = PG_GETARG_TEXT_PP(0);
1201 1924 : text *in2 = PG_GETARG_TEXT_PP(1);
1202 : float4 res;
1203 :
1204 3848 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1205 3848 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1206 : WORD_SIMILARITY_CHECK_ONLY);
1207 :
1208 1924 : PG_FREE_IF_COPY(in1, 0);
1209 1924 : PG_FREE_IF_COPY(in2, 1);
1210 1924 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1211 : }
1212 :
1213 : Datum
1214 1924 : word_similarity_commutator_op(PG_FUNCTION_ARGS)
1215 : {
1216 1924 : text *in1 = PG_GETARG_TEXT_PP(0);
1217 1924 : text *in2 = PG_GETARG_TEXT_PP(1);
1218 : float4 res;
1219 :
1220 3848 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1221 3848 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1222 : WORD_SIMILARITY_CHECK_ONLY);
1223 :
1224 1924 : PG_FREE_IF_COPY(in1, 0);
1225 1924 : PG_FREE_IF_COPY(in2, 1);
1226 1924 : PG_RETURN_BOOL(res >= word_similarity_threshold);
1227 : }
1228 :
1229 : Datum
1230 UBC 0 : word_similarity_dist_op(PG_FUNCTION_ARGS)
1231 : {
1232 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1233 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1234 : float4 res;
1235 :
1236 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1237 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1238 : 0);
1239 :
1240 0 : PG_FREE_IF_COPY(in1, 0);
1241 0 : PG_FREE_IF_COPY(in2, 1);
1242 0 : PG_RETURN_FLOAT4(1.0 - res);
1243 : }
1244 :
1245 : Datum
1246 CBC 714 : word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1247 : {
1248 714 : text *in1 = PG_GETARG_TEXT_PP(0);
1249 714 : text *in2 = PG_GETARG_TEXT_PP(1);
1250 : float4 res;
1251 :
1252 1428 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1253 1428 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1254 : 0);
1255 :
1256 714 : PG_FREE_IF_COPY(in1, 0);
1257 714 : PG_FREE_IF_COPY(in2, 1);
1258 714 : PG_RETURN_FLOAT4(1.0 - res);
1259 : }
1260 :
1261 : Datum
1262 2530 : strict_word_similarity_op(PG_FUNCTION_ARGS)
1263 : {
1264 2530 : text *in1 = PG_GETARG_TEXT_PP(0);
1265 2530 : text *in2 = PG_GETARG_TEXT_PP(1);
1266 : float4 res;
1267 :
1268 5060 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1269 5060 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1270 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1271 :
1272 2530 : PG_FREE_IF_COPY(in1, 0);
1273 2530 : PG_FREE_IF_COPY(in2, 1);
1274 2530 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1275 : }
1276 :
1277 : Datum
1278 2530 : strict_word_similarity_commutator_op(PG_FUNCTION_ARGS)
1279 : {
1280 2530 : text *in1 = PG_GETARG_TEXT_PP(0);
1281 2530 : text *in2 = PG_GETARG_TEXT_PP(1);
1282 : float4 res;
1283 :
1284 5060 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1285 5060 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1286 : WORD_SIMILARITY_CHECK_ONLY | WORD_SIMILARITY_STRICT);
1287 :
1288 2530 : PG_FREE_IF_COPY(in1, 0);
1289 2530 : PG_FREE_IF_COPY(in2, 1);
1290 2530 : PG_RETURN_BOOL(res >= strict_word_similarity_threshold);
1291 : }
1292 :
1293 : Datum
1294 UBC 0 : strict_word_similarity_dist_op(PG_FUNCTION_ARGS)
1295 : {
1296 0 : text *in1 = PG_GETARG_TEXT_PP(0);
1297 0 : text *in2 = PG_GETARG_TEXT_PP(1);
1298 : float4 res;
1299 :
1300 0 : res = calc_word_similarity(VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1301 0 : VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1302 : WORD_SIMILARITY_STRICT);
1303 :
1304 0 : PG_FREE_IF_COPY(in1, 0);
1305 0 : PG_FREE_IF_COPY(in2, 1);
1306 0 : PG_RETURN_FLOAT4(1.0 - res);
1307 : }
1308 :
1309 : Datum
1310 CBC 720 : strict_word_similarity_dist_commutator_op(PG_FUNCTION_ARGS)
1311 : {
1312 720 : text *in1 = PG_GETARG_TEXT_PP(0);
1313 720 : text *in2 = PG_GETARG_TEXT_PP(1);
1314 : float4 res;
1315 :
1316 1440 : res = calc_word_similarity(VARDATA_ANY(in2), VARSIZE_ANY_EXHDR(in2),
1317 1440 : VARDATA_ANY(in1), VARSIZE_ANY_EXHDR(in1),
1318 : WORD_SIMILARITY_STRICT);
1319 :
1320 720 : PG_FREE_IF_COPY(in1, 0);
1321 720 : PG_FREE_IF_COPY(in2, 1);
1322 720 : PG_RETURN_FLOAT4(1.0 - res);
1323 : }
|