Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsvector_op.c
4 : * operations over tsvector
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/utils/adt/tsvector_op.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include <limits.h>
17 :
18 : #include "access/htup_details.h"
19 : #include "catalog/namespace.h"
20 : #include "catalog/pg_type.h"
21 : #include "commands/trigger.h"
22 : #include "executor/spi.h"
23 : #include "funcapi.h"
24 : #include "lib/qunique.h"
25 : #include "mb/pg_wchar.h"
26 : #include "miscadmin.h"
27 : #include "parser/parse_coerce.h"
28 : #include "tsearch/ts_utils.h"
29 : #include "utils/array.h"
30 : #include "utils/builtins.h"
31 : #include "utils/lsyscache.h"
32 : #include "utils/regproc.h"
33 : #include "utils/rel.h"
34 :
35 :
36 : typedef struct
37 : {
38 : WordEntry *arrb;
39 : WordEntry *arre;
40 : char *values;
41 : char *operand;
42 : } CHKVAL;
43 :
44 :
45 : typedef struct StatEntry
46 : {
47 : uint32 ndoc; /* zero indicates that we were already here
48 : * while walking through the tree */
49 : uint32 nentry;
50 : struct StatEntry *left;
51 : struct StatEntry *right;
52 : uint32 lenlexeme;
53 : char lexeme[FLEXIBLE_ARRAY_MEMBER];
54 : } StatEntry;
55 :
56 : #define STATENTRYHDRSZ (offsetof(StatEntry, lexeme))
57 :
58 : typedef struct
59 : {
60 : int32 weight;
61 :
62 : uint32 maxdepth;
63 :
64 : StatEntry **stack;
65 : uint32 stackpos;
66 :
67 : StatEntry *root;
68 : } TSVectorStat;
69 :
70 :
71 : static TSTernaryValue TS_execute_recurse(QueryItem *curitem, void *arg,
72 : uint32 flags,
73 : TSExecuteCallback chkcond);
74 : static bool TS_execute_locations_recurse(QueryItem *curitem,
75 : void *arg,
76 : TSExecuteCallback chkcond,
77 : List **locations);
78 : static int tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len);
79 : static Datum tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column);
80 :
81 :
82 : /*
83 : * Order: haspos, len, word, for all positions (pos, weight)
84 : */
85 : static int
5710 tgl 86 GIC 1 : silly_cmp_tsvector(const TSVector a, const TSVector b)
87 : {
88 1 : if (VARSIZE(a) < VARSIZE(b))
5710 tgl 89 UIC 0 : return -1;
5710 tgl 90 CBC 1 : else if (VARSIZE(a) > VARSIZE(b))
5710 tgl 91 UIC 0 : return 1;
5710 tgl 92 CBC 1 : else if (a->size < b->size)
5710 tgl 93 UBC 0 : return -1;
5710 tgl 94 CBC 1 : else if (a->size > b->size)
5710 tgl 95 UBC 0 : return 1;
5710 tgl 96 ECB : else
5710 tgl 97 EUB : {
5710 tgl 98 CBC 1 : WordEntry *aptr = ARRPTR(a);
5710 tgl 99 GBC 1 : WordEntry *bptr = ARRPTR(b);
5710 tgl 100 GIC 1 : int i = 0;
101 : int res;
5710 tgl 102 ECB :
103 :
5710 tgl 104 CBC 4 : for (i = 0; i < a->size; i++)
105 : {
5710 tgl 106 GIC 3 : if (aptr->haspos != bptr->haspos)
107 : {
5710 tgl 108 LBC 0 : return (aptr->haspos > bptr->haspos) ? -1 : 1;
109 : }
5050 bruce 110 CBC 3 : else if ((res = tsCompareString(STRPTR(a) + aptr->pos, aptr->len, STRPTR(b) + bptr->pos, bptr->len, false)) != 0)
111 : {
5710 tgl 112 UBC 0 : return res;
113 : }
5710 tgl 114 CBC 3 : else if (aptr->haspos)
115 : {
5710 tgl 116 UBC 0 : WordEntryPos *ap = POSDATAPTR(a, aptr);
5710 tgl 117 UIC 0 : WordEntryPos *bp = POSDATAPTR(b, bptr);
5710 tgl 118 ECB : int j;
119 :
5710 tgl 120 UBC 0 : if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
121 0 : return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
122 :
5710 tgl 123 UIC 0 : for (j = 0; j < POSDATALEN(a, aptr); j++)
5710 tgl 124 EUB : {
5710 tgl 125 UBC 0 : if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
126 : {
127 0 : return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1;
128 : }
129 0 : else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp))
130 : {
131 0 : return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1;
132 : }
133 0 : ap++, bp++;
134 : }
5710 tgl 135 EUB : }
136 :
5710 tgl 137 GBC 3 : aptr++;
5710 tgl 138 GIC 3 : bptr++;
139 : }
140 : }
5710 tgl 141 ECB :
5710 tgl 142 CBC 1 : return 0;
143 : }
144 :
145 : #define TSVECTORCMPFUNC( type, action, ret ) \
5710 tgl 146 ECB : Datum \
147 : tsvector_##type(PG_FUNCTION_ARGS) \
148 : { \
149 : TSVector a = PG_GETARG_TSVECTOR(0); \
150 : TSVector b = PG_GETARG_TSVECTOR(1); \
151 : int res = silly_cmp_tsvector(a, b); \
152 : PG_FREE_IF_COPY(a,0); \
153 : PG_FREE_IF_COPY(b,1); \
154 : PG_RETURN_##ret( res action 0 ); \
155 : } \
156 : /* keep compiler quiet - no extra ; */ \
157 : extern int no_such_variable
158 :
5710 tgl 159 UIC 0 : TSVECTORCMPFUNC(lt, <, BOOL);
160 0 : TSVECTORCMPFUNC(le, <=, BOOL);
5710 tgl 161 GIC 1 : TSVECTORCMPFUNC(eq, ==, BOOL);
5710 tgl 162 UIC 0 : TSVECTORCMPFUNC(ge, >=, BOOL);
5710 tgl 163 UBC 0 : TSVECTORCMPFUNC(gt, >, BOOL);
164 0 : TSVECTORCMPFUNC(ne, !=, BOOL);
5710 tgl 165 LBC 0 : TSVECTORCMPFUNC(cmp, +, INT32);
5710 tgl 166 EUB :
167 : Datum
5710 tgl 168 GBC 45 : tsvector_strip(PG_FUNCTION_ARGS)
5710 tgl 169 EUB : {
5710 tgl 170 GIC 45 : TSVector in = PG_GETARG_TSVECTOR(0);
171 : TSVector out;
5710 tgl 172 ECB : int i,
5710 tgl 173 GIC 45 : len = 0;
5710 tgl 174 CBC 45 : WordEntry *arrin = ARRPTR(in),
175 : *arrout;
176 : char *cur;
5710 tgl 177 ECB :
5710 tgl 178 CBC 159 : for (i = 0; i < in->size; i++)
5693 teodor 179 GIC 114 : len += arrin[i].len;
180 :
5710 tgl 181 45 : len = CALCDATASIZE(in->size, len);
5710 tgl 182 CBC 45 : out = (TSVector) palloc0(len);
183 45 : SET_VARSIZE(out, len);
5710 tgl 184 GIC 45 : out->size = in->size;
5710 tgl 185 CBC 45 : arrout = ARRPTR(out);
186 45 : cur = STRPTR(out);
187 159 : for (i = 0; i < in->size; i++)
5710 tgl 188 ECB : {
5710 tgl 189 CBC 114 : memcpy(cur, STRPTR(in) + arrin[i].pos, arrin[i].len);
190 114 : arrout[i].haspos = 0;
191 114 : arrout[i].len = arrin[i].len;
5710 tgl 192 GIC 114 : arrout[i].pos = cur - STRPTR(out);
5693 teodor 193 CBC 114 : cur += arrout[i].len;
5710 tgl 194 ECB : }
195 :
5710 tgl 196 CBC 45 : PG_FREE_IF_COPY(in, 0);
197 45 : PG_RETURN_POINTER(out);
198 : }
199 :
5710 tgl 200 ECB : Datum
5710 tgl 201 CBC 5 : tsvector_length(PG_FUNCTION_ARGS)
202 : {
5710 tgl 203 GIC 5 : TSVector in = PG_GETARG_TSVECTOR(0);
3940 peter_e 204 5 : int32 ret = in->size;
5710 tgl 205 ECB :
5710 tgl 206 GIC 5 : PG_FREE_IF_COPY(in, 0);
5710 tgl 207 CBC 5 : PG_RETURN_INT32(ret);
5710 tgl 208 ECB : }
209 :
210 : Datum
5710 tgl 211 CBC 6 : tsvector_setweight(PG_FUNCTION_ARGS)
212 : {
5710 tgl 213 GIC 6 : TSVector in = PG_GETARG_TSVECTOR(0);
214 6 : char cw = PG_GETARG_CHAR(1);
5710 tgl 215 ECB : TSVector out;
216 : int i,
217 : j;
218 : WordEntry *entry;
219 : WordEntryPos *p;
5710 tgl 220 GIC 6 : int w = 0;
221 :
222 6 : switch (cw)
223 : {
5710 tgl 224 LBC 0 : case 'A':
225 : case 'a':
226 0 : w = 3;
5710 tgl 227 UIC 0 : break;
5710 tgl 228 UBC 0 : case 'B':
229 : case 'b':
230 0 : w = 2;
231 0 : break;
5710 tgl 232 GBC 6 : case 'C':
233 : case 'c':
234 6 : w = 1;
235 6 : break;
5710 tgl 236 LBC 0 : case 'D':
237 : case 'd':
238 0 : w = 0;
239 0 : break;
5710 tgl 240 UBC 0 : default:
241 : /* internal error */
5611 242 0 : elog(ERROR, "unrecognized weight: %d", cw);
5710 tgl 243 EUB : }
244 :
5710 tgl 245 GIC 6 : out = (TSVector) palloc(VARSIZE(in));
5710 tgl 246 GBC 6 : memcpy(out, in, VARSIZE(in));
5710 tgl 247 GIC 6 : entry = ARRPTR(out);
248 6 : i = out->size;
5710 tgl 249 CBC 30 : while (i--)
5710 tgl 250 ECB : {
5710 tgl 251 CBC 24 : if ((j = POSDATALEN(out, entry)) != 0)
5710 tgl 252 ECB : {
5710 tgl 253 CBC 24 : p = POSDATAPTR(out, entry);
5710 tgl 254 GIC 84 : while (j--)
5710 tgl 255 ECB : {
5710 tgl 256 GIC 60 : WEP_SETWEIGHT(*p, w);
5710 tgl 257 CBC 60 : p++;
5710 tgl 258 ECB : }
259 : }
5710 tgl 260 CBC 24 : entry++;
5710 tgl 261 ECB : }
262 :
5710 tgl 263 GIC 6 : PG_FREE_IF_COPY(in, 0);
5710 tgl 264 CBC 6 : PG_RETURN_POINTER(out);
265 : }
266 :
2585 teodor 267 ECB : /*
268 : * setweight(tsin tsvector, char_weight "char", lexemes "text"[])
269 : *
270 : * Assign weight w to elements of tsin that are listed in lexemes.
271 : */
272 : Datum
2585 teodor 273 GIC 12 : tsvector_setweight_by_filter(PG_FUNCTION_ARGS)
274 : {
275 12 : TSVector tsin = PG_GETARG_TSVECTOR(0);
276 12 : char char_weight = PG_GETARG_CHAR(1);
2585 teodor 277 CBC 12 : ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2);
278 :
2585 teodor 279 ECB : TSVector tsout;
280 : int i,
281 : j,
282 : nlexemes,
283 : weight;
284 : WordEntry *entry;
285 : Datum *dlexemes;
286 : bool *nulls;
287 :
2585 teodor 288 GIC 12 : switch (char_weight)
289 : {
2495 rhaas 290 UIC 0 : case 'A':
291 : case 'a':
2585 teodor 292 LBC 0 : weight = 3;
2585 teodor 293 UIC 0 : break;
2495 rhaas 294 UBC 0 : case 'B':
295 : case 'b':
2585 teodor 296 0 : weight = 2;
297 0 : break;
2495 rhaas 298 GBC 12 : case 'C':
299 : case 'c':
2585 teodor 300 12 : weight = 1;
301 12 : break;
2495 rhaas 302 LBC 0 : case 'D':
303 : case 'd':
2585 teodor 304 0 : weight = 0;
305 0 : break;
2585 teodor 306 UBC 0 : default:
307 : /* internal error */
308 0 : elog(ERROR, "unrecognized weight: %c", char_weight);
2585 teodor 309 EUB : }
310 :
2585 teodor 311 GIC 12 : tsout = (TSVector) palloc(VARSIZE(tsin));
2585 teodor 312 GBC 12 : memcpy(tsout, tsin, VARSIZE(tsin));
2585 teodor 313 GIC 12 : entry = ARRPTR(tsout);
314 :
282 peter 315 GNC 12 : deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlexemes);
2585 teodor 316 ECB :
317 : /*
2495 rhaas 318 : * Assuming that lexemes array is significantly shorter than tsvector we
319 : * can iterate through lexemes performing binary search of each lexeme
320 : * from lexemes in tsvector.
321 : */
2585 teodor 322 GIC 36 : for (i = 0; i < nlexemes; i++)
323 : {
324 : char *lex;
2495 rhaas 325 ECB : int lex_len,
326 : lex_pos;
327 :
328 : /* Ignore null array elements, they surely don't match */
2585 teodor 329 GIC 24 : if (nulls[i])
519 tgl 330 3 : continue;
331 :
2585 teodor 332 CBC 21 : lex = VARDATA(dlexemes[i]);
2219 noah 333 21 : lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
2585 teodor 334 GIC 21 : lex_pos = tsvector_bsearch(tsout, lex, lex_len);
2585 teodor 335 ECB :
2585 teodor 336 CBC 21 : if (lex_pos >= 0 && (j = POSDATALEN(tsout, entry + lex_pos)) != 0)
2585 teodor 337 ECB : {
2585 teodor 338 GIC 12 : WordEntryPos *p = POSDATAPTR(tsout, entry + lex_pos);
2495 rhaas 339 ECB :
2585 teodor 340 GIC 39 : while (j--)
2585 teodor 341 ECB : {
2585 teodor 342 GIC 27 : WEP_SETWEIGHT(*p, weight);
2585 teodor 343 CBC 27 : p++;
344 : }
2585 teodor 345 ECB : }
346 : }
347 :
2585 teodor 348 GIC 12 : PG_FREE_IF_COPY(tsin, 0);
349 12 : PG_FREE_IF_COPY(lexemes, 2);
350 :
2585 teodor 351 CBC 12 : PG_RETURN_POINTER(tsout);
2585 teodor 352 ECB : }
353 :
/*
 * Compare the lexemes of two WordEntry items: "a" lives in string area "pa",
 * "b" in string area "pb".  Result is whatever tsCompareString() returns for
 * a non-prefix comparison.
 */
#define compareEntry(pa, a, pb, b) \
	tsCompareString((pa) + (a)->pos, (a)->len, (pb) + (b)->pos, (b)->len, false)
358 :
359 : /*
360 : * Add positions from src to dest after offsetting them by maxpos.
361 : * Return the number added (might be less than expected due to overflow)
362 : */
363 : static int32
5624 bruce 364 GIC 6 : add_pos(TSVector src, WordEntry *srcptr,
365 : TSVector dest, WordEntry *destptr,
366 : int32 maxpos)
5710 tgl 367 ECB : {
5689 teodor 368 GIC 6 : uint16 *clen = &_POSVECPTR(dest, destptr)->npos;
369 : int i;
5710 tgl 370 6 : uint16 slen = POSDATALEN(src, srcptr),
5710 tgl 371 ECB : startlen;
5710 tgl 372 GIC 6 : WordEntryPos *spos = POSDATAPTR(src, srcptr),
5710 tgl 373 CBC 6 : *dpos = POSDATAPTR(dest, destptr);
374 :
375 6 : if (!destptr->haspos)
5710 tgl 376 LBC 0 : *clen = 0;
377 :
5710 tgl 378 CBC 6 : startlen = *clen;
5647 tgl 379 GBC 6 : for (i = 0;
5647 tgl 380 GIC 12 : i < slen && *clen < MAXNUMPOS &&
5624 bruce 381 CBC 6 : (*clen == 0 || WEP_GETPOS(dpos[*clen - 1]) != MAXENTRYPOS - 1);
5647 tgl 382 6 : i++)
5710 tgl 383 ECB : {
5710 tgl 384 CBC 6 : WEP_SETWEIGHT(dpos[*clen], WEP_GETWEIGHT(spos[i]));
385 6 : WEP_SETPOS(dpos[*clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos));
5710 tgl 386 GIC 6 : (*clen)++;
5710 tgl 387 ECB : }
388 :
5710 tgl 389 CBC 6 : if (*clen != startlen)
5710 tgl 390 GIC 6 : destptr->haspos = 1;
391 6 : return *clen - startlen;
5710 tgl 392 ECB : }
393 :
2585 teodor 394 : /*
395 : * Perform binary search of given lexeme in TSVector.
396 : * Returns lexeme position in TSVector's entry array or -1 if lexeme wasn't
397 : * found.
398 : */
399 : static int
2585 teodor 400 GIC 99 : tsvector_bsearch(const TSVector tsv, char *lexeme, int lexeme_len)
401 : {
402 99 : WordEntry *arrin = ARRPTR(tsv);
2585 teodor 403 CBC 99 : int StopLow = 0,
2585 teodor 404 GIC 99 : StopHigh = tsv->size,
2585 teodor 405 ECB : StopMiddle,
406 : cmp;
407 :
2585 teodor 408 GIC 261 : while (StopLow < StopHigh)
409 : {
2495 rhaas 410 231 : StopMiddle = (StopLow + StopHigh) / 2;
2585 teodor 411 ECB :
2585 teodor 412 GIC 231 : cmp = tsCompareString(lexeme, lexeme_len,
2495 rhaas 413 CBC 231 : STRPTR(tsv) + arrin[StopMiddle].pos,
2495 rhaas 414 GIC 231 : arrin[StopMiddle].len,
2495 rhaas 415 ECB : false);
2585 teodor 416 :
2585 teodor 417 CBC 231 : if (cmp < 0)
2585 teodor 418 GIC 108 : StopHigh = StopMiddle;
419 123 : else if (cmp > 0)
2585 teodor 420 CBC 54 : StopLow = StopMiddle + 1;
2118 tgl 421 ECB : else /* found it */
2585 teodor 422 CBC 69 : return StopMiddle;
2585 teodor 423 ECB : }
424 :
2585 teodor 425 CBC 30 : return -1;
426 : }
427 :
2438 tgl 428 ECB : /*
429 : * qsort comparator functions
430 : */
431 :
/*
 * qsort/qunique comparator for plain ints: returns -1, 0 or 1.
 */
static int
compare_int(const void *va, const void *vb)
{
	const int	lhs = *((const int *) va);
	const int	rhs = *((const int *) vb);

	if (lhs < rhs)
		return -1;
	if (lhs > rhs)
		return 1;
	return 0;
}
442 :
2438 tgl 443 : static int
2438 tgl 444 GIC 51 : compare_text_lexemes(const void *va, const void *vb)
445 : {
446 51 : Datum a = *((const Datum *) va);
2438 tgl 447 CBC 51 : Datum b = *((const Datum *) vb);
2438 tgl 448 GIC 51 : char *alex = VARDATA_ANY(a);
2438 tgl 449 CBC 51 : int alex_len = VARSIZE_ANY_EXHDR(a);
450 51 : char *blex = VARDATA_ANY(b);
451 51 : int blex_len = VARSIZE_ANY_EXHDR(b);
2438 tgl 452 ECB :
2438 tgl 453 CBC 51 : return tsCompareString(alex, alex_len, blex, blex_len, false);
2438 tgl 454 ECB : }
455 :
2585 teodor 456 : /*
457 : * Internal routine to delete lexemes from TSVector by array of offsets.
458 : *
459 : * int *indices_to_delete -- array of lexeme offsets to delete (modified here!)
460 : * int indices_count -- size of that array
461 : *
462 : * Returns new TSVector without given lexemes along with their positions
463 : * and weights.
464 : */
465 : static TSVector
2585 teodor 466 GIC 33 : tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
467 : int indices_count)
468 : {
2585 teodor 469 ECB : TSVector tsout;
2585 teodor 470 GIC 33 : WordEntry *arrin = ARRPTR(tsv),
471 : *arrout;
472 33 : char *data = STRPTR(tsv),
2585 teodor 473 ECB : *dataout;
474 : int i, /* index in arrin */
2438 tgl 475 : j, /* index in arrout */
476 : k, /* index in indices_to_delete */
477 : curoff; /* index in dataout area */
478 :
479 : /*
480 : * Sort the filter array to simplify membership checks below. Also, get
481 : * rid of any duplicate entries, so that we can assume that indices_count
482 : * is exactly equal to the number of lexemes that will be removed.
483 : */
2585 teodor 484 GIC 33 : if (indices_count > 1)
485 : {
2438 tgl 486 15 : qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
1249 tmunro 487 CBC 15 : indices_count = qunique(indices_to_delete, indices_count, sizeof(int),
488 : compare_int);
2438 tgl 489 ECB : }
2585 teodor 490 :
491 : /*
492 : * Here we overestimate tsout size, since we don't know how much space is
493 : * used by the deleted lexeme(s). We will set exact size below.
494 : */
2438 tgl 495 GIC 33 : tsout = (TSVector) palloc0(VARSIZE(tsv));
496 :
497 : /* This count must be correct because STRPTR(tsout) relies on it. */
2438 tgl 498 CBC 33 : tsout->size = tsv->size - indices_count;
499 :
500 : /*
2438 tgl 501 ECB : * Copy tsv to tsout, skipping lexemes listed in indices_to_delete.
502 : */
2438 tgl 503 GIC 33 : arrout = ARRPTR(tsout);
2585 teodor 504 33 : dataout = STRPTR(tsout);
2438 tgl 505 33 : curoff = 0;
2585 teodor 506 CBC 198 : for (i = j = k = 0; i < tsv->size; i++)
2585 teodor 507 ECB : {
508 : /*
2438 tgl 509 : * If current i is present in indices_to_delete, skip this lexeme.
510 : * Since indices_to_delete is already sorted, we only need to check
511 : * the current (k'th) entry.
512 : */
2495 rhaas 513 GIC 165 : if (k < indices_count && i == indices_to_delete[k])
514 : {
2585 teodor 515 48 : k++;
2585 teodor 516 CBC 48 : continue;
517 : }
2585 teodor 518 ECB :
2438 tgl 519 : /* Copy lexeme and its positions and weights */
2585 teodor 520 GIC 117 : memcpy(dataout + curoff, data + arrin[i].pos, arrin[i].len);
521 117 : arrout[j].haspos = arrin[i].haspos;
522 117 : arrout[j].len = arrin[i].len;
2585 teodor 523 CBC 117 : arrout[j].pos = curoff;
524 117 : curoff += arrin[i].len;
525 117 : if (arrin[i].haspos)
2585 teodor 526 ECB : {
2438 tgl 527 CBC 78 : int len = POSDATALEN(tsv, arrin + i) * sizeof(WordEntryPos)
528 78 : + sizeof(uint16);
529 :
2585 teodor 530 78 : curoff = SHORTALIGN(curoff);
531 78 : memcpy(dataout + curoff,
2585 teodor 532 GIC 78 : STRPTR(tsv) + SHORTALIGN(arrin[i].pos + arrin[i].len),
2585 teodor 533 ECB : len);
2585 teodor 534 CBC 78 : curoff += len;
2585 teodor 535 ECB : }
536 :
2585 teodor 537 CBC 117 : j++;
538 : }
539 :
2585 teodor 540 ECB : /*
541 : * k should now be exactly equal to indices_count. If it isn't then the
542 : * caller provided us with indices outside of [0, tsv->size) range and
543 : * estimation of tsout's size is wrong.
544 : */
2585 teodor 545 GIC 33 : Assert(k == indices_count);
546 :
547 33 : SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff));
2585 teodor 548 CBC 33 : return tsout;
549 : }
2585 teodor 550 ECB :
551 : /*
552 : * Delete given lexeme from tsvector.
553 : * Implementation of user-level ts_delete(tsvector, text).
554 : */
555 : Datum
2585 teodor 556 GIC 18 : tsvector_delete_str(PG_FUNCTION_ARGS)
557 : {
558 18 : TSVector tsin = PG_GETARG_TSVECTOR(0),
2585 teodor 559 ECB : tsout;
2219 noah 560 GIC 18 : text *tlexeme = PG_GETARG_TEXT_PP(1);
2219 noah 561 CBC 18 : char *lexeme = VARDATA_ANY(tlexeme);
2585 teodor 562 GIC 18 : int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme),
2585 teodor 563 ECB : skip_index;
564 :
2585 teodor 565 CBC 18 : if ((skip_index = tsvector_bsearch(tsin, lexeme, lexeme_len)) == -1)
2585 teodor 566 GIC 6 : PG_RETURN_POINTER(tsin);
567 :
2585 teodor 568 CBC 12 : tsout = tsvector_delete_by_indices(tsin, &skip_index, 1);
2585 teodor 569 ECB :
2585 teodor 570 GIC 12 : PG_FREE_IF_COPY(tsin, 0);
2585 teodor 571 CBC 12 : PG_FREE_IF_COPY(tlexeme, 1);
2585 teodor 572 GIC 12 : PG_RETURN_POINTER(tsout);
2585 teodor 573 ECB : }
574 :
575 : /*
576 : * Delete given array of lexemes from tsvector.
577 : * Implementation of user-level ts_delete(tsvector, text[]).
578 : */
579 : Datum
2585 teodor 580 GIC 21 : tsvector_delete_arr(PG_FUNCTION_ARGS)
581 : {
582 21 : TSVector tsin = PG_GETARG_TSVECTOR(0),
2585 teodor 583 ECB : tsout;
2585 teodor 584 GIC 21 : ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1);
2495 rhaas 585 ECB : int i,
586 : nlex,
2585 teodor 587 : skip_count,
588 : *skip_indices;
589 : Datum *dlexemes;
590 : bool *nulls;
591 :
282 peter 592 GNC 21 : deconstruct_array_builtin(lexemes, TEXTOID, &dlexemes, &nulls, &nlex);
593 :
2585 teodor 594 ECB : /*
595 : * In typical use case array of lexemes to delete is relatively small. So
596 : * here we optimize things for that scenario: iterate through lexarr
597 : * performing binary search of each lexeme from lexarr in tsvector.
598 : */
2585 teodor 599 GIC 21 : skip_indices = palloc0(nlex * sizeof(int));
600 84 : for (i = skip_count = 0; i < nlex; i++)
2585 teodor 601 ECB : {
2495 rhaas 602 : char *lex;
603 : int lex_len,
604 : lex_pos;
605 :
606 : /* Ignore null array elements, they surely don't match */
2585 teodor 607 GIC 63 : if (nulls[i])
519 tgl 608 3 : continue;
2585 teodor 609 ECB :
2219 noah 610 CBC 60 : lex = VARDATA(dlexemes[i]);
2219 noah 611 GIC 60 : lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
2585 teodor 612 CBC 60 : lex_pos = tsvector_bsearch(tsin, lex, lex_len);
2585 teodor 613 ECB :
2585 teodor 614 CBC 60 : if (lex_pos >= 0)
2585 teodor 615 GIC 39 : skip_indices[skip_count++] = lex_pos;
2585 teodor 616 ECB : }
617 :
2585 teodor 618 GIC 21 : tsout = tsvector_delete_by_indices(tsin, skip_indices, skip_count);
619 :
2585 teodor 620 CBC 21 : pfree(skip_indices);
2585 teodor 621 GIC 21 : PG_FREE_IF_COPY(tsin, 0);
2585 teodor 622 CBC 21 : PG_FREE_IF_COPY(lexemes, 1);
2585 teodor 623 ECB :
2585 teodor 624 CBC 21 : PG_RETURN_POINTER(tsout);
625 : }
2585 teodor 626 ECB :
627 : /*
628 : * Expand tsvector as table with following columns:
629 : * lexeme: lexeme text
630 : * positions: integer array of lexeme positions
631 : * weights: char array of weights corresponding to positions
632 : */
633 : Datum
2585 teodor 634 GIC 90 : tsvector_unnest(PG_FUNCTION_ARGS)
635 : {
2495 rhaas 636 ECB : FuncCallContext *funcctx;
637 : TSVector tsin;
638 :
2585 teodor 639 GIC 90 : if (SRF_IS_FIRSTCALL())
640 : {
2585 teodor 641 ECB : MemoryContext oldcontext;
642 : TupleDesc tupdesc;
643 :
2585 teodor 644 GIC 15 : funcctx = SRF_FIRSTCALL_INIT();
645 15 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
2585 teodor 646 ECB :
1601 andres 647 CBC 15 : tupdesc = CreateTemplateTupleDesc(3);
2585 teodor 648 GIC 15 : TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme",
2585 teodor 649 ECB : TEXTOID, -1, 0);
2585 teodor 650 CBC 15 : TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions",
651 : INT2ARRAYOID, -1, 0);
652 15 : TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights",
653 : TEXTARRAYOID, -1, 0);
109 michael 654 GNC 15 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
109 michael 655 UNC 0 : elog(ERROR, "return type must be a row type");
109 michael 656 GNC 15 : funcctx->tuple_desc = tupdesc;
657 :
2585 teodor 658 CBC 15 : funcctx->user_fctx = PG_GETARG_TSVECTOR_COPY(0);
2585 teodor 659 EUB :
2585 teodor 660 CBC 15 : MemoryContextSwitchTo(oldcontext);
661 : }
2585 teodor 662 ECB :
2585 teodor 663 GIC 90 : funcctx = SRF_PERCALL_SETUP();
2585 teodor 664 CBC 90 : tsin = (TSVector) funcctx->user_fctx;
665 :
2585 teodor 666 GIC 90 : if (funcctx->call_cntr < tsin->size)
2585 teodor 667 ECB : {
2585 teodor 668 CBC 75 : WordEntry *arrin = ARRPTR(tsin);
2585 teodor 669 GIC 75 : char *data = STRPTR(tsin);
2585 teodor 670 ECB : HeapTuple tuple;
671 : int j,
2585 teodor 672 CBC 75 : i = funcctx->call_cntr;
673 75 : bool nulls[] = {false, false, false};
674 : Datum values[3];
675 :
1165 alvherre 676 75 : values[0] = PointerGetDatum(cstring_to_text_with_len(data + arrin[i].pos, arrin[i].len));
2585 teodor 677 ECB :
2585 teodor 678 GIC 75 : if (arrin[i].haspos)
679 : {
2585 teodor 680 ECB : WordEntryPosVector *posv;
681 : Datum *positions;
682 : Datum *weights;
683 : char weight;
684 :
685 : /*
686 : * Internally tsvector stores position and weight in the same
687 : * uint16 (2 bits for weight, 14 for position). Here we extract
688 : * that in two separate arrays.
689 : */
2585 teodor 690 GIC 45 : posv = _POSVECPTR(tsin, arrin + i);
691 45 : positions = palloc(posv->npos * sizeof(Datum));
2495 rhaas 692 45 : weights = palloc(posv->npos * sizeof(Datum));
2585 teodor 693 126 : for (j = 0; j < posv->npos; j++)
2585 teodor 694 ECB : {
2585 teodor 695 CBC 81 : positions[j] = Int16GetDatum(WEP_GETPOS(posv->pos[j]));
696 81 : weight = 'D' - WEP_GETWEIGHT(posv->pos[j]);
1165 alvherre 697 81 : weights[j] = PointerGetDatum(cstring_to_text_with_len(&weight,
698 : 1));
2585 teodor 699 ECB : }
700 :
282 peter 701 GNC 45 : values[1] = PointerGetDatum(construct_array_builtin(positions, posv->npos, INT2OID));
702 45 : values[2] = PointerGetDatum(construct_array_builtin(weights, posv->npos, TEXTOID));
2585 teodor 703 ECB : }
704 : else
705 : {
2585 teodor 706 GIC 30 : nulls[1] = nulls[2] = true;
707 : }
2585 teodor 708 ECB :
2585 teodor 709 GIC 75 : tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
710 75 : SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
2585 teodor 711 ECB : }
712 : else
713 : {
2585 teodor 714 GIC 15 : SRF_RETURN_DONE(funcctx);
715 : }
2585 teodor 716 ECB : }
717 :
718 : /*
719 : * Convert tsvector to array of lexemes.
720 : */
721 : Datum
2585 teodor 722 GIC 6 : tsvector_to_array(PG_FUNCTION_ARGS)
723 : {
2495 rhaas 724 CBC 6 : TSVector tsin = PG_GETARG_TSVECTOR(0);
2495 rhaas 725 GIC 6 : WordEntry *arrin = ARRPTR(tsin);
2495 rhaas 726 ECB : Datum *elements;
727 : int i;
728 : ArrayType *array;
729 :
2585 teodor 730 GIC 6 : elements = palloc(tsin->size * sizeof(Datum));
731 :
2585 teodor 732 CBC 36 : for (i = 0; i < tsin->size; i++)
733 : {
1165 alvherre 734 30 : elements[i] = PointerGetDatum(cstring_to_text_with_len(STRPTR(tsin) + arrin[i].pos,
1165 alvherre 735 GIC 30 : arrin[i].len));
2585 teodor 736 ECB : }
737 :
282 peter 738 GNC 6 : array = construct_array_builtin(elements, tsin->size, TEXTOID);
739 :
2585 teodor 740 CBC 6 : pfree(elements);
2585 teodor 741 GIC 6 : PG_FREE_IF_COPY(tsin, 0);
2585 teodor 742 CBC 6 : PG_RETURN_POINTER(array);
2585 teodor 743 ECB : }
744 :
745 : /*
746 : * Build tsvector from array of lexemes.
747 : */
748 : Datum
2585 teodor 749 GIC 12 : array_to_tsvector(PG_FUNCTION_ARGS)
750 : {
2585 teodor 751 CBC 12 : ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
752 : TSVector tsout;
2585 teodor 753 ECB : Datum *dlexemes;
754 : WordEntry *arrout;
755 : bool *nulls;
756 : int nitems,
757 : i,
758 : tslen,
2585 teodor 759 GIC 12 : datalen = 0;
760 : char *cur;
2585 teodor 761 ECB :
282 peter 762 GNC 12 : deconstruct_array_builtin(v, TEXTOID, &dlexemes, &nulls, &nitems);
763 :
519 tgl 764 ECB : /*
765 : * Reject nulls and zero length strings (maybe we should just ignore them,
766 : * instead?)
767 : */
2585 teodor 768 GIC 63 : for (i = 0; i < nitems; i++)
769 : {
2585 teodor 770 CBC 57 : if (nulls[i])
2585 teodor 771 GIC 3 : ereport(ERROR,
2438 tgl 772 ECB : (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
2585 teodor 773 : errmsg("lexeme array may not contain nulls")));
774 :
519 tgl 775 GIC 54 : if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0)
776 3 : ereport(ERROR,
519 tgl 777 ECB : (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING),
778 : errmsg("lexeme array may not contain empty strings")));
779 : }
780 :
781 : /* Sort and de-dup, because this is required for a valid tsvector. */
2438 tgl 782 GIC 6 : if (nitems > 1)
783 : {
2438 tgl 784 CBC 6 : qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
1249 tmunro 785 GIC 6 : nitems = qunique(dlexemes, nitems, sizeof(Datum),
1249 tmunro 786 ECB : compare_text_lexemes);
2585 teodor 787 : }
788 :
789 : /* Calculate space needed for surviving lexemes. */
2438 tgl 790 GIC 30 : for (i = 0; i < nitems; i++)
2219 noah 791 24 : datalen += VARSIZE(dlexemes[i]) - VARHDRSZ;
2585 teodor 792 CBC 6 : tslen = CALCDATASIZE(nitems, datalen);
2438 tgl 793 ECB :
794 : /* Allocate and fill tsvector. */
2585 teodor 795 GIC 6 : tsout = (TSVector) palloc0(tslen);
796 6 : SET_VARSIZE(tsout, tslen);
2585 teodor 797 CBC 6 : tsout->size = nitems;
2438 tgl 798 ECB :
2585 teodor 799 CBC 6 : arrout = ARRPTR(tsout);
2585 teodor 800 GIC 6 : cur = STRPTR(tsout);
2585 teodor 801 CBC 30 : for (i = 0; i < nitems; i++)
2585 teodor 802 ECB : {
2219 noah 803 CBC 24 : char *lex = VARDATA(dlexemes[i]);
2219 noah 804 GIC 24 : int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ;
2585 teodor 805 ECB :
2585 teodor 806 CBC 24 : memcpy(cur, lex, lex_len);
2585 teodor 807 GIC 24 : arrout[i].haspos = 0;
2585 teodor 808 CBC 24 : arrout[i].len = lex_len;
809 24 : arrout[i].pos = cur - STRPTR(tsout);
810 24 : cur += lex_len;
2585 teodor 811 ECB : }
812 :
2585 teodor 813 GIC 6 : PG_FREE_IF_COPY(v, 0);
814 6 : PG_RETURN_POINTER(tsout);
2585 teodor 815 ECB : }
816 :
817 : /*
818 : * ts_filter(): keep only lexemes with given weights in tsvector.
819 : */
820 : Datum
2585 teodor 821 GIC 9 : tsvector_filter(PG_FUNCTION_ARGS)
822 : {
2585 teodor 823 CBC 9 : TSVector tsin = PG_GETARG_TSVECTOR(0),
824 : tsout;
825 9 : ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1);
2585 teodor 826 GIC 9 : WordEntry *arrin = ARRPTR(tsin),
2585 teodor 827 ECB : *arrout;
2585 teodor 828 CBC 9 : char *datain = STRPTR(tsin),
829 : *dataout;
2585 teodor 830 ECB : Datum *dweights;
831 : bool *nulls;
832 : int nweights;
833 : int i,
834 : j;
2531 teodor 835 GIC 9 : int cur_pos = 0;
836 9 : char mask = 0;
2585 teodor 837 ECB :
282 peter 838 GNC 9 : deconstruct_array_builtin(weights, CHAROID, &dweights, &nulls, &nweights);
2585 teodor 839 ECB :
2530 tgl 840 GIC 21 : for (i = 0; i < nweights; i++)
2585 teodor 841 ECB : {
842 : char char_weight;
843 :
2585 teodor 844 GIC 15 : if (nulls[i])
2585 teodor 845 CBC 3 : ereport(ERROR,
2438 tgl 846 ECB : (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
847 : errmsg("weight array may not contain nulls")));
848 :
2585 teodor 849 GIC 12 : char_weight = DatumGetChar(dweights[i]);
2585 teodor 850 CBC 12 : switch (char_weight)
2585 teodor 851 ECB : {
2495 rhaas 852 GIC 9 : case 'A':
2495 rhaas 853 ECB : case 'a':
2585 teodor 854 GIC 9 : mask = mask | 8;
2585 teodor 855 CBC 9 : break;
2495 rhaas 856 3 : case 'B':
2495 rhaas 857 ECB : case 'b':
2585 teodor 858 GIC 3 : mask = mask | 4;
2585 teodor 859 CBC 3 : break;
2495 rhaas 860 LBC 0 : case 'C':
2495 rhaas 861 EUB : case 'c':
2585 teodor 862 UIC 0 : mask = mask | 2;
2585 teodor 863 UBC 0 : break;
2495 rhaas 864 0 : case 'D':
2495 rhaas 865 EUB : case 'd':
2585 teodor 866 UIC 0 : mask = mask | 1;
2585 teodor 867 UBC 0 : break;
868 0 : default:
2495 rhaas 869 0 : ereport(ERROR,
2495 rhaas 870 EUB : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
871 : errmsg("unrecognized weight: \"%c\"", char_weight)));
872 : }
873 : }
874 :
2585 teodor 875 GIC 6 : tsout = (TSVector) palloc0(VARSIZE(tsin));
2585 teodor 876 CBC 6 : tsout->size = tsin->size;
877 6 : arrout = ARRPTR(tsout);
878 6 : dataout = STRPTR(tsout);
2585 teodor 879 ECB :
2585 teodor 880 GIC 54 : for (i = j = 0; i < tsin->size; i++)
2585 teodor 881 ECB : {
882 : WordEntryPosVector *posvin,
883 : *posvout;
2495 rhaas 884 GIC 48 : int npos = 0;
2495 rhaas 885 ECB : int k;
886 :
2585 teodor 887 GIC 48 : if (!arrin[i].haspos)
2585 teodor 888 CBC 15 : continue;
2585 teodor 889 ECB :
2495 rhaas 890 GIC 33 : posvin = _POSVECPTR(tsin, arrin + i);
2585 teodor 891 CBC 33 : posvout = (WordEntryPosVector *)
2495 rhaas 892 33 : (dataout + SHORTALIGN(cur_pos + arrin[i].len));
2585 teodor 893 ECB :
2585 teodor 894 GIC 66 : for (k = 0; k < posvin->npos; k++)
2585 teodor 895 ECB : {
2585 teodor 896 GIC 33 : if (mask & (1 << WEP_GETWEIGHT(posvin->pos[k])))
2585 teodor 897 CBC 15 : posvout->pos[npos++] = posvin->pos[k];
2585 teodor 898 ECB : }
899 :
900 : /* if no satisfactory positions found, skip lexeme */
2532 rhaas 901 GIC 33 : if (!npos)
2585 teodor 902 CBC 18 : continue;
2585 teodor 903 ECB :
2585 teodor 904 GIC 15 : arrout[j].haspos = true;
2585 teodor 905 CBC 15 : arrout[j].len = arrin[i].len;
906 15 : arrout[j].pos = cur_pos;
2585 teodor 907 ECB :
2585 teodor 908 GIC 15 : memcpy(dataout + cur_pos, datain + arrin[i].pos, arrin[i].len);
2585 teodor 909 CBC 15 : posvout->npos = npos;
910 15 : cur_pos += SHORTALIGN(arrin[i].len);
2495 rhaas 911 15 : cur_pos += POSDATALEN(tsout, arrout + j) * sizeof(WordEntryPos) +
2495 rhaas 912 ECB : sizeof(uint16);
2585 teodor 913 GIC 15 : j++;
2585 teodor 914 ECB : }
915 :
2585 teodor 916 GIC 6 : tsout->size = j;
2585 teodor 917 CBC 6 : if (dataout != STRPTR(tsout))
918 6 : memmove(STRPTR(tsout), dataout, cur_pos);
2585 teodor 919 ECB :
2585 teodor 920 GIC 6 : SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, cur_pos));
2585 teodor 921 ECB :
2585 teodor 922 GIC 6 : PG_FREE_IF_COPY(tsin, 0);
2585 teodor 923 CBC 6 : PG_RETURN_POINTER(tsout);
2585 teodor 924 ECB : }
925 :
/*
 * tsvector_concat
 *
 * Build a new tsvector from the two tsvector arguments (argument 0 = in1,
 * argument 1 = in2).
 *
 * Both inputs' WordEntry arrays are walked in parallel, ordered by
 * compareEntry(), so the output entries come out in that same order:
 *	- an entry present in only one input is copied to the output;
 *	- an entry present in both inputs is emitted once, with in1's position
 *	  data (if any) followed by whatever add_pos() appends for in2.
 *
 * Position data taken from in2 is always routed through add_pos() together
 * with "maxpos", the largest position found in in1.  add_pos() is defined
 * elsewhere in this file; per the comment on the maxpos scan below, it is
 * what offsets in2's positions.
 *
 * The result is palloc'd.  An ERROR is raised if the accumulated lexeme and
 * position storage exceeds MAXSTRPOS.
 */
Datum
tsvector_concat(PG_FUNCTION_ARGS)
{
	TSVector	in1 = PG_GETARG_TSVECTOR(0);
	TSVector	in2 = PG_GETARG_TSVECTOR(1);
	TSVector	out;
	WordEntry  *ptr;			/* scan pointer for in1, then the output
								 * entry being filled */
	WordEntry  *ptr1,			/* current entry of in1 */
			   *ptr2;			/* current entry of in2 */
	WordEntryPos *p;
	int			maxpos = 0,		/* largest position seen in in1 */
				i,
				j,
				i1,				/* entries of in1 not yet consumed */
				i2,				/* entries of in2 not yet consumed */
				dataoff,		/* bytes used so far in the output data area */
				output_bytes,
				output_size;
	char	   *data,			/* output data area */
			   *data1,			/* in1's data area */
			   *data2;			/* in2's data area */

	/* Get max position in in1; we'll need this to offset in2's positions */
	ptr = ARRPTR(in1);
	i = in1->size;
	while (i--)
	{
		if ((j = POSDATALEN(in1, ptr)) != 0)
		{
			p = POSDATAPTR(in1, ptr);
			while (j--)
			{
				if (WEP_GETPOS(*p) > maxpos)
					maxpos = WEP_GETPOS(*p);
				p++;
			}
		}
		ptr++;
	}

	ptr1 = ARRPTR(in1);
	ptr2 = ARRPTR(in2);
	data1 = STRPTR(in1);
	data2 = STRPTR(in2);
	i1 = in1->size;
	i2 = in2->size;

	/*
	 * Conservative estimate of space needed.  We might need all the data in
	 * both inputs, and conceivably add a pad byte before position data for
	 * each item where there was none before.
	 */
	output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 + i2;

	out = (TSVector) palloc0(output_bytes);
	SET_VARSIZE(out, output_bytes);

	/*
	 * We must make out->size valid so that STRPTR(out) is sensible.  We'll
	 * collapse out any unused space at the end.
	 */
	out->size = in1->size + in2->size;

	ptr = ARRPTR(out);
	data = STRPTR(out);
	dataoff = 0;

	/* Merge step: runs while both inputs still have unconsumed entries */
	while (i1 && i2)
	{
		int			cmp = compareEntry(data1, ptr1, data2, ptr2);

		if (cmp < 0)
		{						/* in1 first */
			ptr->haspos = ptr1->haspos;
			ptr->len = ptr1->len;
			memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
			ptr->pos = dataoff;
			dataoff += ptr1->len;
			if (ptr->haspos)
			{
				/* in1's position vector (count + positions) is copied as-is */
				dataoff = SHORTALIGN(dataoff);
				memcpy(data + dataoff, _POSVECPTR(in1, ptr1),
					   POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
				dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
			}

			ptr++;
			ptr1++;
			i1--;
		}
		else if (cmp > 0)
		{						/* in2 first */
			ptr->haspos = ptr2->haspos;
			ptr->len = ptr2->len;
			memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
			ptr->pos = dataoff;
			dataoff += ptr2->len;
			if (ptr->haspos)
			{
				/*
				 * The output entry's pos/len/haspos are set before this call,
				 * since add_pos() receives the output entry itself.
				 */
				int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);

				/* add_pos() returned zero: emit the entry without positions */
				if (addlen == 0)
					ptr->haspos = 0;
				else
				{
					dataoff = SHORTALIGN(dataoff);
					dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
				}
			}

			ptr++;
			ptr2++;
			i2--;
		}
		else
		{
			/* Entries compare equal: emit once, using in1's lexeme bytes */
			ptr->haspos = ptr1->haspos | ptr2->haspos;
			ptr->len = ptr1->len;
			memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
			ptr->pos = dataoff;
			dataoff += ptr1->len;
			if (ptr->haspos)
			{
				if (ptr1->haspos)
				{
					dataoff = SHORTALIGN(dataoff);
					memcpy(data + dataoff, _POSVECPTR(in1, ptr1),
						   POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
					dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);

					/*
					 * Only the added positions are counted here; the uint16
					 * count field was already accounted for just above.
					 */
					if (ptr2->haspos)
						dataoff += add_pos(in2, ptr2, out, ptr, maxpos) * sizeof(WordEntryPos);
				}
				else			/* must have ptr2->haspos */
				{
					int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);

					if (addlen == 0)
						ptr->haspos = 0;
					else
					{
						dataoff = SHORTALIGN(dataoff);
						dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
					}
				}
			}

			ptr++;
			ptr1++;
			ptr2++;
			i1--;
			i2--;
		}
	}

	/* Leftover entries of in1: same handling as the "in1 first" case */
	while (i1)
	{
		ptr->haspos = ptr1->haspos;
		ptr->len = ptr1->len;
		memcpy(data + dataoff, data1 + ptr1->pos, ptr1->len);
		ptr->pos = dataoff;
		dataoff += ptr1->len;
		if (ptr->haspos)
		{
			dataoff = SHORTALIGN(dataoff);
			memcpy(data + dataoff, _POSVECPTR(in1, ptr1),
				   POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
			dataoff += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
		}

		ptr++;
		ptr1++;
		i1--;
	}

	/* Leftover entries of in2: same handling as the "in2 first" case */
	while (i2)
	{
		ptr->haspos = ptr2->haspos;
		ptr->len = ptr2->len;
		memcpy(data + dataoff, data2 + ptr2->pos, ptr2->len);
		ptr->pos = dataoff;
		dataoff += ptr2->len;
		if (ptr->haspos)
		{
			int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);

			if (addlen == 0)
				ptr->haspos = 0;
			else
			{
				dataoff = SHORTALIGN(dataoff);
				dataoff += addlen * sizeof(WordEntryPos) + sizeof(uint16);
			}
		}

		ptr++;
		ptr2++;
		i2--;
	}

	/*
	 * Instead of checking each offset individually, we check for overflow of
	 * pos fields once at the end.
	 */
	if (dataoff > MAXSTRPOS)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", dataoff, MAXSTRPOS)));

	/*
	 * Adjust sizes (asserting that we didn't overrun the original estimates)
	 * and collapse out any unused array entries.
	 */
	output_size = ptr - ARRPTR(out);
	Assert(output_size <= out->size);
	out->size = output_size;

	/*
	 * STRPTR(out) is re-evaluated after out->size was changed; if it no
	 * longer equals the address we wrote to, slide the data to that address.
	 * memmove is used, which tolerates overlapping regions.
	 */
	if (data != STRPTR(out))
		memmove(STRPTR(out), data, dataoff);
	output_bytes = CALCDATASIZE(out->size, dataoff);
	Assert(output_bytes <= VARSIZE(out));
	SET_VARSIZE(out, output_bytes);

	PG_FREE_IF_COPY(in1, 0);
	PG_FREE_IF_COPY(in2, 1);
	PG_RETURN_POINTER(out);
}
1147 :
/*
 * tsCompareString
 *
 * Order two length-counted byte strings the way tsvector code expects.
 *
 * a, lena:	first string and its length in bytes
 * b, lenb:	second string and its length in bytes
 * prefix:	if true, the result is zero iff a is a prefix of b
 *
 * Without "prefix", the result is negative, zero or positive as a sorts
 * before, equal to, or after b; when one string is a leading part of the
 * other, the shorter one sorts first.
 */
int32
tsCompareString(char *a, int lena, char *b, int lenb, bool prefix)
{
	unsigned int ulena;
	unsigned int ulenb;
	int			order;

	/* An empty "a" is a prefix of anything, and sorts before non-empty b */
	if (lena == 0)
	{
		if (prefix)
			return 0;
		return (lenb > 0) ? -1 : 0;
	}

	/* "a" is not empty here, so an empty "b" cannot match it */
	if (lenb == 0)
		return (lena > 0) ? 1 : 0;

	/* Compare the bytes both strings have in common */
	ulena = (unsigned int) lena;
	ulenb = (unsigned int) lenb;
	order = memcmp(a, b, (ulena < ulenb) ? ulena : ulenb);
	if (order != 0)
		return order;

	/* Common part is identical; the lengths decide the outcome */
	if (prefix)
		return (lena > lenb) ? 1 : 0;	/* a longer than b: not a prefix */

	if (lena == lenb)
		return 0;

	return (lena < lenb) ? -1 : 1;
}
1186 :
1187 : /*
1188 : * Check weight info or/and fill 'data' with the required positions
1189 : */
1190 : static TSTernaryValue
2558 teodor 1191 GIC 34041 : checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
2558 teodor 1192 ECB : ExecPhraseData *data)
1193 : {
989 tgl 1194 GIC 34041 : TSTernaryValue result = TS_NO;
5689 teodor 1195 ECB :
989 tgl 1196 GIC 34041 : Assert(data == NULL || data->npos == 0);
989 tgl 1197 ECB :
989 tgl 1198 GIC 34041 : if (entry->haspos)
2558 teodor 1199 ECB : {
1200 : WordEntryPosVector *posvec;
1201 :
1202 : /*
1203 : * We can't use the _POSVECPTR macro here because the pointer to the
1204 : * tsvector's lexeme storage is already contained in chkval->values.
1205 : */
2558 teodor 1206 GIC 2244 : posvec = (WordEntryPosVector *)
2558 teodor 1207 CBC 2244 : (chkval->values + SHORTALIGN(entry->pos + entry->len));
2558 teodor 1208 ECB :
2558 teodor 1209 GIC 2244 : if (val->weight && data)
2558 teodor 1210 CBC 24 : {
2495 rhaas 1211 24 : WordEntryPos *posvec_iter = posvec->pos;
2495 rhaas 1212 ECB : WordEntryPos *dptr;
1213 :
1214 : /*
1215 : * Filter position information by weights
1216 : */
2558 teodor 1217 GIC 24 : dptr = data->pos = palloc(sizeof(WordEntryPos) * posvec->npos);
2558 teodor 1218 CBC 24 : data->allocated = true;
2558 teodor 1219 ECB :
1220 : /* Is there a position with a matching weight? */
2558 teodor 1221 GIC 48 : while (posvec_iter < posvec->pos + posvec->npos)
2558 teodor 1222 ECB : {
1223 : /* If true, append this position to the data->pos */
2558 teodor 1224 GIC 24 : if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
2558 teodor 1225 ECB : {
2558 teodor 1226 GIC 12 : *dptr = WEP_GETPOS(*posvec_iter);
2558 teodor 1227 CBC 12 : dptr++;
2558 teodor 1228 ECB : }
1229 :
2558 teodor 1230 GIC 24 : posvec_iter++;
2558 teodor 1231 ECB : }
1232 :
2558 teodor 1233 GIC 24 : data->npos = dptr - data->pos;
5710 tgl 1234 ECB :
2558 teodor 1235 GIC 24 : if (data->npos > 0)
989 tgl 1236 CBC 12 : result = TS_YES;
989 tgl 1237 ECB : else
1238 : {
989 tgl 1239 GIC 12 : pfree(data->pos);
989 tgl 1240 CBC 12 : data->pos = NULL;
1241 12 : data->allocated = false;
989 tgl 1242 ECB : }
1243 : }
2558 teodor 1244 GIC 2220 : else if (val->weight)
2558 teodor 1245 ECB : {
2495 rhaas 1246 GIC 228 : WordEntryPos *posvec_iter = posvec->pos;
2558 teodor 1247 ECB :
1248 : /* Is there a position with a matching weight? */
2558 teodor 1249 GIC 345 : while (posvec_iter < posvec->pos + posvec->npos)
2558 teodor 1250 ECB : {
2558 teodor 1251 GIC 252 : if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter)))
2558 teodor 1252 ECB : {
989 tgl 1253 GIC 135 : result = TS_YES;
2495 rhaas 1254 CBC 135 : break; /* no need to go further */
2558 teodor 1255 ECB : }
1256 :
2558 teodor 1257 GIC 117 : posvec_iter++;
2558 teodor 1258 ECB : }
1259 : }
989 tgl 1260 GIC 1992 : else if (data)
2558 teodor 1261 ECB : {
2558 teodor 1262 GIC 1137 : data->npos = posvec->npos;
2495 rhaas 1263 CBC 1137 : data->pos = posvec->pos;
2558 teodor 1264 1137 : data->allocated = false;
989 tgl 1265 1137 : result = TS_YES;
989 tgl 1266 ECB : }
1267 : else
1268 : {
1269 : /* simplest case: no weight check, positions not needed */
989 tgl 1270 GIC 855 : result = TS_YES;
2558 teodor 1271 ECB : }
1272 : }
1273 : else
1274 : {
1275 : /*
1276 : * Position info is lacking, so if the caller requires it, we can only
1277 : * say that maybe there is a match.
1278 : *
1279 : * Notice, however, that we *don't* check val->weight here.
1280 : * Historically, stripped tsvectors are considered to match queries
1281 : * whether or not the query has a weight restriction; that's a little
1282 : * dubious but we'll preserve the behavior.
1283 : */
989 tgl 1284 GIC 31797 : if (data)
989 tgl 1285 CBC 11529 : result = TS_MAYBE;
989 tgl 1286 ECB : else
989 tgl 1287 GIC 20268 : result = TS_YES;
5710 tgl 1288 ECB : }
1289 :
2558 teodor 1290 GIC 34041 : return result;
2558 teodor 1291 ECB : }
1292 :
1293 : /*
1294 : * TS_execute callback for matching a tsquery operand to plain tsvector data
1295 : */
1296 : static TSTernaryValue
2558 teodor 1297 GIC 142011 : checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
5710 tgl 1298 ECB : {
5624 bruce 1299 GIC 142011 : CHKVAL *chkval = (CHKVAL *) checkval;
5693 teodor 1300 CBC 142011 : WordEntry *StopLow = chkval->arrb;
1301 142011 : WordEntry *StopHigh = chkval->arre;
5441 tgl 1302 142011 : WordEntry *StopMiddle = StopHigh;
989 1303 142011 : TSTernaryValue res = TS_NO;
5710 tgl 1304 ECB :
1305 : /* Loop invariant: StopLow <= val < StopHigh */
5710 tgl 1306 GIC 893403 : while (StopLow < StopHigh)
5710 tgl 1307 ECB : {
1308 : int difference;
1309 :
5710 tgl 1310 GIC 777915 : StopMiddle = StopLow + (StopHigh - StopLow) / 2;
2558 teodor 1311 CBC 777915 : difference = tsCompareString(chkval->operand + val->distance,
1312 777915 : val->length,
1313 777915 : chkval->values + StopMiddle->pos,
1314 777915 : StopMiddle->len,
5050 bruce 1315 ECB : false);
1316 :
5710 tgl 1317 GIC 777915 : if (difference == 0)
5441 tgl 1318 ECB : {
1319 : /* Check weight info & fill 'data' with positions */
2558 teodor 1320 GIC 26523 : res = checkclass_str(chkval, StopMiddle, val, data);
5441 tgl 1321 CBC 26523 : break;
5441 tgl 1322 ECB : }
5441 tgl 1323 GIC 751392 : else if (difference > 0)
5710 tgl 1324 CBC 423756 : StopLow = StopMiddle + 1;
5710 tgl 1325 ECB : else
5710 tgl 1326 GIC 327636 : StopHigh = StopMiddle;
5710 tgl 1327 ECB : }
1328 :
1329 : /*
1330 : * If it's a prefix search, we should also consider lexemes that the
1331 : * search term is a prefix of (which will necessarily immediately follow
1332 : * the place we found in the above loop). But we can skip them if there
1333 : * was a definite match on the exact term AND the caller doesn't need
1334 : * position info.
1335 : */
989 tgl 1336 GIC 142011 : if (val->prefix && (res != TS_YES || data))
5441 tgl 1337 ECB : {
2495 rhaas 1338 GIC 8262 : WordEntryPos *allpos = NULL;
2495 rhaas 1339 CBC 8262 : int npos = 0,
1340 8262 : totalpos = 0;
2495 rhaas 1341 ECB :
1342 : /* adjust start position for corner case */
5050 bruce 1343 GIC 8262 : if (StopLow >= StopHigh)
5441 tgl 1344 CBC 8256 : StopMiddle = StopHigh;
5441 tgl 1345 ECB :
1346 : /* we don't try to re-use any data from the initial match */
989 tgl 1347 GIC 8262 : if (data)
989 tgl 1348 ECB : {
989 tgl 1349 GIC 18 : if (data->allocated)
989 tgl 1350 LBC 0 : pfree(data->pos);
989 tgl 1351 GBC 18 : data->pos = NULL;
989 tgl 1352 CBC 18 : data->allocated = false;
1353 18 : data->npos = 0;
989 tgl 1354 ECB : }
989 tgl 1355 GIC 8262 : res = TS_NO;
989 tgl 1356 ECB :
989 tgl 1357 GIC 15729 : while ((res != TS_YES || data) &&
989 tgl 1358 CBC 23745 : StopMiddle < chkval->arre &&
2558 teodor 1359 7965 : tsCompareString(chkval->operand + val->distance,
1360 7965 : val->length,
1361 7965 : chkval->values + StopMiddle->pos,
1362 7965 : StopMiddle->len,
5050 bruce 1363 ECB : true) == 0)
1364 : {
1365 : TSTernaryValue subres;
1366 :
989 tgl 1367 GIC 7518 : subres = checkclass_str(chkval, StopMiddle, val, data);
2558 teodor 1368 ECB :
989 tgl 1369 GIC 7518 : if (subres != TS_NO)
989 tgl 1370 ECB : {
989 tgl 1371 GIC 7488 : if (data)
2558 teodor 1372 ECB : {
1373 : /*
1374 : * We need to join position information
1375 : */
989 tgl 1376 GIC 21 : if (subres == TS_MAYBE)
989 tgl 1377 ECB : {
1378 : /*
1379 : * No position info for this match, so we must report
1380 : * MAYBE overall.
1381 : */
989 tgl 1382 UIC 0 : res = TS_MAYBE;
989 tgl 1383 EUB : /* forget any previous positions */
989 tgl 1384 UIC 0 : npos = 0;
989 tgl 1385 EUB : /* don't leak storage */
989 tgl 1386 UIC 0 : if (allpos)
989 tgl 1387 UBC 0 : pfree(allpos);
1388 0 : break;
989 tgl 1389 EUB : }
1390 :
989 tgl 1391 GIC 39 : while (npos + data->npos > totalpos)
2558 teodor 1392 ECB : {
2558 teodor 1393 GIC 18 : if (totalpos == 0)
2558 teodor 1394 ECB : {
2558 teodor 1395 GIC 18 : totalpos = 256;
2558 teodor 1396 CBC 18 : allpos = palloc(sizeof(WordEntryPos) * totalpos);
2558 teodor 1397 ECB : }
1398 : else
1399 : {
2558 teodor 1400 UIC 0 : totalpos *= 2;
2558 teodor 1401 UBC 0 : allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos);
2558 teodor 1402 EUB : }
1403 : }
1404 :
2558 teodor 1405 GIC 21 : memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos);
2558 teodor 1406 CBC 21 : npos += data->npos;
989 tgl 1407 ECB :
1408 : /* don't leak storage from individual matches */
989 tgl 1409 GIC 21 : if (data->allocated)
989 tgl 1410 CBC 12 : pfree(data->pos);
1411 21 : data->pos = NULL;
1412 21 : data->allocated = false;
989 tgl 1413 ECB : /* it's important to reset data->npos before next loop */
989 tgl 1414 GIC 21 : data->npos = 0;
2558 teodor 1415 ECB : }
1416 : else
1417 : {
1418 : /* Don't need positions, just handle YES/MAYBE */
989 tgl 1419 GIC 7467 : if (subres == TS_YES || res == TS_NO)
989 tgl 1420 CBC 7467 : res = subres;
1067 tgl 1421 ECB : }
1422 : }
1423 :
5441 tgl 1424 GIC 7518 : StopMiddle++;
5441 tgl 1425 ECB : }
1426 :
989 tgl 1427 GIC 8262 : if (data && npos > 0)
2558 teodor 1428 ECB : {
1429 : /* Sort and make unique array of found positions */
2558 teodor 1430 GIC 18 : data->pos = allpos;
1249 tmunro 1431 CBC 18 : qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
1432 18 : data->npos = qunique(data->pos, npos, sizeof(WordEntryPos),
1249 tmunro 1433 ECB : compareWordEntryPos);
2558 teodor 1434 GIC 18 : data->allocated = true;
989 tgl 1435 CBC 18 : res = TS_YES;
2558 teodor 1436 ECB : }
1437 : }
1438 :
5050 bruce 1439 GIC 142011 : return res;
5710 tgl 1440 ECB : }
1441 :
1442 : /*
1443 : * Compute output position list for a tsquery operator in phrase mode.
1444 : *
1445 : * Merge the position lists in Ldata and Rdata as specified by "emit",
1446 : * returning the result list into *data. The input position lists must be
1447 : * sorted and unique, and the output will be as well.
1448 : *
1449 : * data: pointer to initially-all-zeroes output struct, or NULL
1450 : * Ldata, Rdata: input position lists
1451 : * emit: bitmask of TSPO_XXX flags
1452 : * Loffset: offset to be added to Ldata positions before comparing/outputting
1453 : * Roffset: offset to be added to Rdata positions before comparing/outputting
1454 : * max_npos: maximum possible required size of output position array
1455 : *
1456 : * Loffset and Roffset should not be negative, else we risk trying to output
1457 : * negative positions, which won't fit into WordEntryPos.
1458 : *
1459 : * The result is boolean (TS_YES or TS_NO), but for the caller's convenience
1460 : * we return it as TSTernaryValue.
1461 : *
1462 : * Returns TS_YES if any positions were emitted to *data; or if data is NULL,
1463 : * returns TS_YES if any positions would have been emitted.
1464 : */
1465 : #define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */
1466 : #define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */
1467 : #define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */
1468 :
1469 : static TSTernaryValue
2300 tgl 1470 GIC 14982 : TS_phrase_output(ExecPhraseData *data,
2300 tgl 1471 ECB : ExecPhraseData *Ldata,
1472 : ExecPhraseData *Rdata,
1473 : int emit,
1474 : int Loffset,
1475 : int Roffset,
1476 : int max_npos)
1477 : {
1478 : int Lindex,
1479 : Rindex;
1480 :
1481 : /* Loop until both inputs are exhausted */
2300 tgl 1482 GIC 14982 : Lindex = Rindex = 0;
2300 tgl 1483 CBC 15498 : while (Lindex < Ldata->npos || Rindex < Rdata->npos)
2300 tgl 1484 ECB : {
1485 : int Lpos,
1486 : Rpos;
2300 tgl 1487 GIC 1167 : int output_pos = 0;
2300 tgl 1488 ECB :
1489 : /*
1490 : * Fetch current values to compare. WEP_GETPOS() is needed because
1491 : * ExecPhraseData->data can point to a tsvector's WordEntryPosVector.
1492 : */
2300 tgl 1493 GIC 1167 : if (Lindex < Ldata->npos)
2300 tgl 1494 CBC 843 : Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset;
2300 tgl 1495 ECB : else
1496 : {
1497 : /* L array exhausted, so we're done if R_ONLY isn't set */
2300 tgl 1498 GIC 324 : if (!(emit & TSPO_R_ONLY))
2300 tgl 1499 CBC 75 : break;
1500 249 : Lpos = INT_MAX;
2300 tgl 1501 ECB : }
2300 tgl 1502 GIC 1092 : if (Rindex < Rdata->npos)
2300 tgl 1503 CBC 969 : Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset;
2300 tgl 1504 ECB : else
1505 : {
1506 : /* R array exhausted, so we're done if L_ONLY isn't set */
2300 tgl 1507 GIC 123 : if (!(emit & TSPO_L_ONLY))
2300 tgl 1508 CBC 81 : break;
1509 42 : Rpos = INT_MAX;
2300 tgl 1510 ECB : }
1511 :
1512 : /* Merge-join the two input lists */
2300 tgl 1513 GIC 1011 : if (Lpos < Rpos)
2300 tgl 1514 ECB : {
1515 : /* Lpos is not matched in Rdata, should we output it? */
2300 tgl 1516 GIC 243 : if (emit & TSPO_L_ONLY)
2300 tgl 1517 CBC 72 : output_pos = Lpos;
1518 243 : Lindex++;
2300 tgl 1519 ECB : }
2300 tgl 1520 GIC 768 : else if (Lpos == Rpos)
2300 tgl 1521 ECB : {
1522 : /* Lpos and Rpos match ... should we output it? */
2300 tgl 1523 GIC 399 : if (emit & TSPO_BOTH)
2300 tgl 1524 CBC 351 : output_pos = Rpos;
1525 399 : Lindex++;
1526 399 : Rindex++;
2300 tgl 1527 ECB : }
1528 : else /* Lpos > Rpos */
1529 : {
1530 : /* Rpos is not matched in Ldata, should we output it? */
2300 tgl 1531 GIC 369 : if (emit & TSPO_R_ONLY)
2300 tgl 1532 CBC 270 : output_pos = Rpos;
1533 369 : Rindex++;
2300 tgl 1534 ECB : }
1535 :
2300 tgl 1536 GIC 1011 : if (output_pos > 0)
2300 tgl 1537 ECB : {
2300 tgl 1538 GIC 693 : if (data)
2300 tgl 1539 ECB : {
1540 : /* Store position, first allocating output array if needed */
2300 tgl 1541 GIC 198 : if (data->pos == NULL)
2300 tgl 1542 ECB : {
2300 tgl 1543 GIC 159 : data->pos = (WordEntryPos *)
2300 tgl 1544 CBC 159 : palloc(max_npos * sizeof(WordEntryPos));
1545 159 : data->allocated = true;
2300 tgl 1546 ECB : }
2300 tgl 1547 GIC 198 : data->pos[data->npos++] = output_pos;
2300 tgl 1548 ECB : }
1549 : else
1550 : {
1551 : /*
1552 : * Exact positions not needed, so return TS_YES as soon as we
1553 : * know there is at least one.
1554 : */
1077 tgl 1555 GIC 495 : return TS_YES;
2300 tgl 1556 ECB : }
1557 : }
1558 : }
1559 :
2300 tgl 1560 GIC 14487 : if (data && data->npos > 0)
2300 tgl 1561 ECB : {
1562 : /* Let's assert we didn't overrun the array */
2300 tgl 1563 GIC 159 : Assert(data->npos <= max_npos);
1077 tgl 1564 CBC 159 : return TS_YES;
2300 tgl 1565 ECB : }
1077 tgl 1566 GIC 14328 : return TS_NO;
2300 tgl 1567 ECB : }
1568 :
1569 : /*
1570 : * Execute tsquery at or below an OP_PHRASE operator.
1571 : *
1572 : * This handles tsquery execution at recursion levels where we need to care
1573 : * about match locations.
1574 : *
1575 : * In addition to the same arguments used for TS_execute, the caller may pass
1576 : * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme
1577 : * match position info on success. data == NULL if no position data need be
1578 : * returned.
1579 : * Note: the function assumes data != NULL for operators other than OP_PHRASE.
1580 : * This is OK because an outside call always starts from an OP_PHRASE node,
1581 : * and all internal recursion cases pass data != NULL.
1582 : *
1583 : * The detailed semantics of the match data, given that the function returned
1584 : * TS_YES (successful match), are:
1585 : *
1586 : * npos > 0, negate = false:
1587 : * query is matched at specified position(s) (and only those positions)
1588 : * npos > 0, negate = true:
1589 : * query is matched at all positions *except* specified position(s)
1590 : * npos = 0, negate = true:
1591 : * query is matched at all positions
1592 : * npos = 0, negate = false:
1593 : * disallowed (this should result in TS_NO or TS_MAYBE, as appropriate)
1594 : *
1595 : * Successful matches also return a "width" value which is the match width in
1596 : * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches,
1597 : * and is the sum of the phrase operator distances for phrase matches. Note
1598 : * that when width > 0, the listed positions represent the ends of matches not
1599 : * the starts. (This unintuitive rule is needed to avoid possibly generating
1600 : * negative positions, which wouldn't fit into the WordEntryPos arrays.)
1601 : *
1602 : * If the TSExecuteCallback function reports that an operand is present
1603 : * but fails to provide position(s) for it, we will return TS_MAYBE when
1604 : * it is possible but not certain that the query is matched.
1605 : *
1606 : * When the function returns TS_NO or TS_MAYBE, it must return npos = 0,
1607 : * negate = false (which is the state initialized by the caller); but the
1608 : * "width" output in such cases is undefined.
1609 : */
1610 : static TSTernaryValue
2305 tgl 1611 GIC 351271 : TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags,
2300 tgl 1612 ECB : TSExecuteCallback chkcond,
1613 : ExecPhraseData *data)
1614 : {
1615 : ExecPhraseData Ldata,
1616 : Rdata;
1617 : TSTernaryValue lmatch,
1618 : rmatch;
1619 : int Loffset,
1620 : Roffset,
1621 : maxwidth;
1622 :
1623 : /* since this function recurses, it could be driven to stack overflow */
2558 teodor 1624 GIC 351271 : check_stack_depth();
2558 teodor 1625 ECB :
1626 : /* ... and let's check for query cancel while we're at it */
139 tgl 1627 GIC 351271 : CHECK_FOR_INTERRUPTS();
139 tgl 1628 ECB :
2558 teodor 1629 GIC 351271 : if (curitem->type == QI_VAL)
989 tgl 1630 CBC 172801 : return chkcond(arg, (QueryOperand *) curitem, data);
2300 tgl 1631 ECB :
2300 tgl 1632 GIC 178470 : switch (curitem->qoperator.oper)
2558 teodor 1633 ECB : {
2300 tgl 1634 GIC 60471 : case OP_NOT:
2558 teodor 1635 ECB :
1636 : /*
1637 : * We need not touch data->width, since a NOT operation does not
1638 : * change the match width.
1639 : */
989 tgl 1640 GIC 60471 : if (flags & TS_EXEC_SKIP_NOT)
2300 tgl 1641 ECB : {
1642 : /* with SKIP_NOT, report NOT as "match everywhere" */
2300 tgl 1643 UIC 0 : Assert(data->npos == 0 && !data->negate);
2300 tgl 1644 UBC 0 : data->negate = true;
1077 1645 0 : return TS_YES;
2300 tgl 1646 EUB : }
1077 tgl 1647 GIC 60471 : switch (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data))
1077 tgl 1648 ECB : {
1077 tgl 1649 GIC 52853 : case TS_NO:
1077 tgl 1650 ECB : /* change "match nowhere" to "match everywhere" */
1077 tgl 1651 GIC 52853 : Assert(data->npos == 0 && !data->negate);
1077 tgl 1652 CBC 52853 : data->negate = true;
1653 52853 : return TS_YES;
1654 195 : case TS_YES:
1655 195 : if (data->npos > 0)
1077 tgl 1656 ECB : {
1657 : /* we have some positions, invert negate flag */
1077 tgl 1658 GIC 192 : data->negate = !data->negate;
1077 tgl 1659 CBC 192 : return TS_YES;
1077 tgl 1660 ECB : }
1077 tgl 1661 GIC 3 : else if (data->negate)
1077 tgl 1662 ECB : {
1663 : /* change "match everywhere" to "match nowhere" */
1077 tgl 1664 GIC 3 : data->negate = false;
1077 tgl 1665 CBC 3 : return TS_NO;
1077 tgl 1666 ECB : }
1667 : /* Should not get here if result was TS_YES */
1077 tgl 1668 UIC 0 : Assert(false);
1077 tgl 1669 EUB : break;
1077 tgl 1670 GIC 7423 : case TS_MAYBE:
1077 tgl 1671 ECB : /* match positions are, and remain, uncertain */
1077 tgl 1672 GIC 7423 : return TS_MAYBE;
1077 tgl 1673 ECB : }
1077 tgl 1674 UIC 0 : break;
2558 teodor 1675 EUB :
2300 tgl 1676 GIC 117921 : case OP_PHRASE:
2300 tgl 1677 ECB : case OP_AND:
2300 tgl 1678 GIC 117921 : memset(&Ldata, 0, sizeof(Ldata));
2300 tgl 1679 CBC 117921 : memset(&Rdata, 0, sizeof(Rdata));
2558 teodor 1680 ECB :
1077 tgl 1681 GIC 117921 : lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
1077 tgl 1682 ECB : arg, flags, chkcond, &Ldata);
1077 tgl 1683 GIC 117921 : if (lmatch == TS_NO)
1077 tgl 1684 CBC 62993 : return TS_NO;
2558 teodor 1685 ECB :
1077 tgl 1686 GIC 54928 : rmatch = TS_phrase_execute(curitem + 1,
1077 tgl 1687 ECB : arg, flags, chkcond, &Rdata);
1077 tgl 1688 GIC 54928 : if (rmatch == TS_NO)
1077 tgl 1689 CBC 27110 : return TS_NO;
2558 teodor 1690 ECB :
1691 : /*
1692 : * If either operand has no position information, then we can't
1693 : * return reliable position data, only a MAYBE result.
1694 : */
1077 tgl 1695 GIC 27818 : if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
1077 tgl 1696 CBC 12914 : return TS_MAYBE;
2558 teodor 1697 ECB :
2300 tgl 1698 GIC 14904 : if (curitem->qoperator.oper == OP_PHRASE)
2300 tgl 1699 ECB : {
1700 : /*
1701 : * Compute Loffset and Roffset suitable for phrase match, and
1702 : * compute overall width of whole phrase match.
1703 : */
2300 tgl 1704 GIC 14901 : Loffset = curitem->qoperator.distance + Rdata.width;
2300 tgl 1705 CBC 14901 : Roffset = 0;
1706 14901 : if (data)
1707 93 : data->width = curitem->qoperator.distance +
1708 93 : Ldata.width + Rdata.width;
2300 tgl 1709 ECB : }
1710 : else
1711 : {
1712 : /*
1713 : * For OP_AND, set output width and alignment like OP_OR (see
1714 : * comment below)
1715 : */
2300 tgl 1716 GIC 3 : maxwidth = Max(Ldata.width, Rdata.width);
2300 tgl 1717 CBC 3 : Loffset = maxwidth - Ldata.width;
1718 3 : Roffset = maxwidth - Rdata.width;
1719 3 : if (data)
1720 3 : data->width = maxwidth;
2300 tgl 1721 ECB : }
1722 :
2300 tgl 1723 GIC 14904 : if (Ldata.negate && Rdata.negate)
2558 teodor 1724 ECB : {
1725 : /* !L & !R: treat as !(L | R) */
2300 tgl 1726 GIC 14217 : (void) TS_phrase_output(data, &Ldata, &Rdata,
2300 tgl 1727 ECB : TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
1728 : Loffset, Roffset,
2300 tgl 1729 GIC 14217 : Ldata.npos + Rdata.npos);
2300 tgl 1730 CBC 14217 : if (data)
2300 tgl 1731 LBC 0 : data->negate = true;
1077 tgl 1732 GBC 14217 : return TS_YES;
2300 tgl 1733 ECB : }
2300 tgl 1734 GIC 687 : else if (Ldata.negate)
2300 tgl 1735 ECB : {
1736 : /* !L & R */
2300 tgl 1737 GIC 225 : return TS_phrase_output(data, &Ldata, &Rdata,
2300 tgl 1738 ECB : TSPO_R_ONLY,
1739 : Loffset, Roffset,
1740 : Rdata.npos);
1741 : }
2300 tgl 1742 GIC 462 : else if (Rdata.negate)
2300 tgl 1743 ECB : {
1744 : /* L & !R */
2300 tgl 1745 GIC 3 : return TS_phrase_output(data, &Ldata, &Rdata,
2300 tgl 1746 ECB : TSPO_L_ONLY,
1747 : Loffset, Roffset,
1748 : Ldata.npos);
1749 : }
1750 : else
1751 : {
1752 : /* straight AND */
2300 tgl 1753 GIC 459 : return TS_phrase_output(data, &Ldata, &Rdata,
2300 tgl 1754 ECB : TSPO_BOTH,
1755 : Loffset, Roffset,
2300 tgl 1756 GIC 459 : Min(Ldata.npos, Rdata.npos));
2300 tgl 1757 ECB : }
1758 :
2300 tgl 1759 GIC 78 : case OP_OR:
2300 tgl 1760 CBC 78 : memset(&Ldata, 0, sizeof(Ldata));
1761 78 : memset(&Rdata, 0, sizeof(Rdata));
2477 teodor 1762 ECB :
2300 tgl 1763 GIC 78 : lmatch = TS_phrase_execute(curitem + curitem->qoperator.left,
2300 tgl 1764 ECB : arg, flags, chkcond, &Ldata);
2300 tgl 1765 GIC 78 : rmatch = TS_phrase_execute(curitem + 1,
2300 tgl 1766 ECB : arg, flags, chkcond, &Rdata);
1767 :
1077 tgl 1768 GIC 78 : if (lmatch == TS_NO && rmatch == TS_NO)
1077 tgl 1769 CBC 6 : return TS_NO;
2558 teodor 1770 ECB :
1771 : /*
1772 : * If either operand has no position information, then we can't
1773 : * return reliable position data, only a MAYBE result.
1774 : */
1077 tgl 1775 GIC 72 : if (lmatch == TS_MAYBE || rmatch == TS_MAYBE)
1077 tgl 1776 LBC 0 : return TS_MAYBE;
2558 teodor 1777 EUB :
1778 : /*
1779 : * Cope with undefined output width from failed submatch. (This
1780 : * takes less code than trying to ensure that all failure returns
1781 : * set data->width to zero.)
1782 : */
1077 tgl 1783 GIC 72 : if (lmatch == TS_NO)
2300 tgl 1784 CBC 9 : Ldata.width = 0;
1077 1785 72 : if (rmatch == TS_NO)
2300 1786 42 : Rdata.width = 0;
2558 teodor 1787 ECB :
1788 : /*
1789 : * For OP_AND and OP_OR, report the width of the wider of the two
1790 : * inputs, and align the narrower input's positions to the right
1791 : * end of that width. This rule deals at least somewhat
1792 : * reasonably with cases like "x <-> (y | z <-> q)".
1793 : */
2300 tgl 1794 GIC 72 : maxwidth = Max(Ldata.width, Rdata.width);
2300 tgl 1795 CBC 72 : Loffset = maxwidth - Ldata.width;
1796 72 : Roffset = maxwidth - Rdata.width;
1797 72 : data->width = maxwidth;
2300 tgl 1798 ECB :
2300 tgl 1799 GIC 72 : if (Ldata.negate && Rdata.negate)
2300 tgl 1800 ECB : {
1801 : /* !L | !R: treat as !(L & R) */
2300 tgl 1802 GIC 3 : (void) TS_phrase_output(data, &Ldata, &Rdata,
2300 tgl 1803 ECB : TSPO_BOTH,
1804 : Loffset, Roffset,
2300 tgl 1805 GIC 3 : Min(Ldata.npos, Rdata.npos));
2300 tgl 1806 CBC 3 : data->negate = true;
1077 1807 3 : return TS_YES;
2300 tgl 1808 ECB : }
2300 tgl 1809 GIC 69 : else if (Ldata.negate)
2300 tgl 1810 ECB : {
1811 : /* !L | R: treat as !(L & !R) */
2300 tgl 1812 GIC 15 : (void) TS_phrase_output(data, &Ldata, &Rdata,
2300 tgl 1813 ECB : TSPO_L_ONLY,
1814 : Loffset, Roffset,
1815 : Ldata.npos);
2300 tgl 1816 GIC 15 : data->negate = true;
1077 tgl 1817 CBC 15 : return TS_YES;
2300 tgl 1818 ECB : }
2300 tgl 1819 GIC 54 : else if (Rdata.negate)
2300 tgl 1820 ECB : {
1821 : /* L | !R: treat as !(!L & R) */
2300 tgl 1822 GIC 3 : (void) TS_phrase_output(data, &Ldata, &Rdata,
2300 tgl 1823 ECB : TSPO_R_ONLY,
1824 : Loffset, Roffset,
1825 : Rdata.npos);
2300 tgl 1826 GIC 3 : data->negate = true;
1077 tgl 1827 CBC 3 : return TS_YES;
2300 tgl 1828 ECB : }
1829 : else
1830 : {
1831 : /* straight OR */
2300 tgl 1832 GIC 51 : return TS_phrase_output(data, &Ldata, &Rdata,
2300 tgl 1833 ECB : TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
1834 : Loffset, Roffset,
2300 tgl 1835 GIC 51 : Ldata.npos + Rdata.npos);
2300 tgl 1836 ECB : }
1837 :
2300 tgl 1838 UIC 0 : default:
2300 tgl 1839 UBC 0 : elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
2558 teodor 1840 EUB : }
1841 :
1842 : /* not reachable, but keep compiler quiet */
1077 tgl 1843 UIC 0 : return TS_NO;
2558 teodor 1844 EUB : }
1845 :
1846 :
1847 : /*
1848 : * Evaluate tsquery boolean expression.
1849 : *
1850 : * curitem: current tsquery item (initially, the first one)
1851 : * arg: opaque value to pass through to callback function
1852 : * flags: bitmask of flag bits shown in ts_utils.h
1853 : * chkcond: callback function to check whether a primitive value is present
1854 : */
1855 : bool
2305 tgl 1856 GIC 259957 : TS_execute(QueryItem *curitem, void *arg, uint32 flags,
2305 tgl 1857 ECB : TSExecuteCallback chkcond)
1858 : {
1859 : /*
1860 : * If we get TS_MAYBE from the recursion, return true. We could only see
1861 : * that result if the caller passed TS_EXEC_PHRASE_NO_POS, so there's no
1862 : * need to check again.
1863 : */
1077 tgl 1864 GIC 259957 : return TS_execute_recurse(curitem, arg, flags, chkcond) != TS_NO;
1077 tgl 1865 ECB : }
1866 :
1867 : /*
1868 : * Evaluate tsquery boolean expression.
1869 : *
1870 : * This is the same as TS_execute except that TS_MAYBE is returned as-is.
1871 : */
1872 : TSTernaryValue
782 tgl 1873 GIC 18471 : TS_execute_ternary(QueryItem *curitem, void *arg, uint32 flags,
782 tgl 1874 ECB : TSExecuteCallback chkcond)
1875 : {
782 tgl 1876 GIC 18471 : return TS_execute_recurse(curitem, arg, flags, chkcond);
782 tgl 1877 ECB : }
1878 :
1879 : /*
1880 : * TS_execute recursion for operators above any phrase operator. Here we do
1881 : * not need to worry about lexeme positions. As soon as we hit an OP_PHRASE
1882 : * operator, we pass it off to TS_phrase_execute which does worry.
1883 : */
1884 : static TSTernaryValue
1077 tgl 1885 GIC 527408 : TS_execute_recurse(QueryItem *curitem, void *arg, uint32 flags,
1077 tgl 1886 ECB : TSExecuteCallback chkcond)
1887 : {
1888 : TSTernaryValue lmatch;
1889 :
1890 : /* since this function recurses, it could be driven to stack overflow */
5700 tgl 1891 GIC 527408 : check_stack_depth();
5700 tgl 1892 ECB :
1893 : /* ... and let's check for query cancel while we're at it */
982 tgl 1894 GIC 527408 : CHECK_FOR_INTERRUPTS();
982 tgl 1895 ECB :
5693 teodor 1896 GIC 527408 : if (curitem->type == QI_VAL)
2305 tgl 1897 CBC 211662 : return chkcond(arg, (QueryOperand *) curitem,
989 tgl 1898 ECB : NULL /* don't need position info */ );
1899 :
5015 peter_e 1900 GIC 315746 : switch (curitem->qoperator.oper)
5710 tgl 1901 ECB : {
5693 teodor 1902 GIC 101628 : case OP_NOT:
989 tgl 1903 CBC 101628 : if (flags & TS_EXEC_SKIP_NOT)
1077 tgl 1904 LBC 0 : return TS_YES;
1077 tgl 1905 GBC 101628 : switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
1077 tgl 1906 ECB : {
1077 tgl 1907 GIC 95879 : case TS_NO:
1077 tgl 1908 CBC 95879 : return TS_YES;
1909 2445 : case TS_YES:
1910 2445 : return TS_NO;
1911 3304 : case TS_MAYBE:
1912 3304 : return TS_MAYBE;
1077 tgl 1913 ECB : }
1077 tgl 1914 UIC 0 : break;
4473 tgl 1915 EUB :
5693 teodor 1916 GIC 41870 : case OP_AND:
1077 tgl 1917 CBC 41870 : lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
1077 tgl 1918 ECB : flags, chkcond);
1077 tgl 1919 GIC 41870 : if (lmatch == TS_NO)
1077 tgl 1920 CBC 33264 : return TS_NO;
1921 8606 : switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
1077 tgl 1922 ECB : {
1077 tgl 1923 GIC 5058 : case TS_NO:
1077 tgl 1924 CBC 5058 : return TS_NO;
1925 1650 : case TS_YES:
1926 1650 : return lmatch;
1927 1898 : case TS_MAYBE:
1928 1898 : return TS_MAYBE;
1077 tgl 1929 ECB : }
1077 tgl 1930 UIC 0 : break;
5693 teodor 1931 EUB :
5693 teodor 1932 GIC 54483 : case OP_OR:
1077 tgl 1933 CBC 54483 : lmatch = TS_execute_recurse(curitem + curitem->qoperator.left, arg,
1077 tgl 1934 ECB : flags, chkcond);
1077 tgl 1935 GIC 54483 : if (lmatch == TS_YES)
1077 tgl 1936 CBC 12090 : return TS_YES;
1937 42393 : switch (TS_execute_recurse(curitem + 1, arg, flags, chkcond))
1077 tgl 1938 ECB : {
1077 tgl 1939 GIC 28753 : case TS_NO:
1077 tgl 1940 CBC 28753 : return lmatch;
1941 3708 : case TS_YES:
1942 3708 : return TS_YES;
1943 9932 : case TS_MAYBE:
1944 9932 : return TS_MAYBE;
1077 tgl 1945 ECB : }
1077 tgl 1946 UIC 0 : break;
5693 teodor 1947 EUB :
2558 teodor 1948 GIC 117765 : case OP_PHRASE:
1077 tgl 1949 ECB :
1950 : /*
1951 : * If we get a MAYBE result, and the caller doesn't want that,
1952 : * convert it to NO. It would be more consistent, perhaps, to
1953 : * return the result of TS_phrase_execute() verbatim and then
1954 : * convert MAYBE results at the top of the recursion. But
1955 : * converting at the topmost phrase operator gives results that
1956 : * are bug-compatible with the old implementation, so do it like
1957 : * this for now.
1958 : */
1077 tgl 1959 GIC 117765 : switch (TS_phrase_execute(curitem, arg, flags, chkcond, NULL))
1077 tgl 1960 ECB : {
1077 tgl 1961 GIC 90199 : case TS_NO:
1077 tgl 1962 CBC 90199 : return TS_NO;
1963 14655 : case TS_YES:
1964 14655 : return TS_YES;
1965 12911 : case TS_MAYBE:
1966 12911 : return (flags & TS_EXEC_PHRASE_NO_POS) ? TS_MAYBE : TS_NO;
1077 tgl 1967 ECB : }
1077 tgl 1968 UIC 0 : break;
2558 teodor 1969 EUB :
5693 teodor 1970 UIC 0 : default:
5015 peter_e 1971 UBC 0 : elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
4473 tgl 1972 EUB : }
1973 :
1974 : /* not reachable, but keep compiler quiet */
1077 tgl 1975 UIC 0 : return TS_NO;
4473 tgl 1976 EUB : }
1977 :
1978 : /*
1979 : * Evaluate tsquery and report locations of matching terms.
1980 : *
1981 : * This is like TS_execute except that it returns match locations not just
1982 : * success/failure status. The callback function is required to provide
1983 : * position data (we report failure if it doesn't).
1984 : *
1985 : * On successful match, the result is a List of ExecPhraseData structs, one
1986 : * for each AND'ed term or phrase operator in the query. Each struct includes
1987 : * a sorted array of lexeme positions matching that term. (Recall that for
1988 : * phrase operators, the match includes width+1 lexemes, and the recorded
1989 : * position is that of the rightmost lexeme.)
1990 : *
1991 : * OR subexpressions are handled by union'ing their match locations into a
1992 : * single List element, which is valid since any of those locations contains
1993 : * a match. However, when some of the OR'ed terms are phrase operators, we
1994 : * report the maximum width of any of the OR'ed terms, making such cases
1995 : * slightly imprecise in the conservative direction. (For example, if the
1996 : * tsquery is "(A <-> B) | C", an occurrence of C in the data would be
1997 : * reported as though it includes the lexeme to the left of C.)
1998 : *
1999 : * Locations of NOT subexpressions are not reported. (Obviously, there can
2000 : * be no successful NOT matches at top level, or the match would have failed.
2001 : * So this amounts to ignoring NOTs underneath ORs.)
2002 : *
2003 : * The result is NIL if no match, or if position data was not returned.
2004 : *
2005 : * Arguments are the same as for TS_execute, although flags is currently
2006 : * vestigial since none of the defined bits are sensible here.
2007 : */
2008 : List *
80 tgl 2009 GNC 181 : TS_execute_locations(QueryItem *curitem, void *arg,
2010 : uint32 flags,
2011 : TSExecuteCallback chkcond)
2012 : {
2013 : List *result;
2014 :
2015 : /* No flags supported, as yet */
2016 181 : Assert(flags == TS_EXEC_EMPTY);
2017 181 : if (TS_execute_locations_recurse(curitem, arg, chkcond, &result))
2018 64 : return result;
2019 117 : return NIL;
2020 : }
2021 :
2022 : /*
2023 : * TS_execute_locations recursion for operators above any phrase operator.
2024 : * OP_PHRASE subexpressions can be passed off to TS_phrase_execute.
2025 : */
2026 : static bool
2027 535 : TS_execute_locations_recurse(QueryItem *curitem, void *arg,
2028 : TSExecuteCallback chkcond,
2029 : List **locations)
2030 : {
2031 : bool lmatch,
2032 : rmatch;
2033 : List *llocations,
2034 : *rlocations;
2035 : ExecPhraseData *data;
2036 :
2037 : /* since this function recurses, it could be driven to stack overflow */
2038 535 : check_stack_depth();
2039 :
2040 : /* ... and let's check for query cancel while we're at it */
2041 535 : CHECK_FOR_INTERRUPTS();
2042 :
2043 : /* Default locations result is empty */
2044 535 : *locations = NIL;
2045 :
2046 535 : if (curitem->type == QI_VAL)
2047 : {
2048 223 : data = palloc0_object(ExecPhraseData);
2049 223 : if (chkcond(arg, (QueryOperand *) curitem, data) == TS_YES)
2050 : {
2051 106 : *locations = list_make1(data);
2052 106 : return true;
2053 : }
2054 117 : pfree(data);
2055 117 : return false;
2056 : }
2057 :
2058 312 : switch (curitem->qoperator.oper)
2059 : {
2060 6 : case OP_NOT:
2061 6 : if (!TS_execute_locations_recurse(curitem + 1, arg, chkcond,
2062 : &llocations))
80 tgl 2063 UNC 0 : return true; /* we don't pass back any locations */
80 tgl 2064 GNC 6 : return false;
2065 :
2066 264 : case OP_AND:
2067 264 : if (!TS_execute_locations_recurse(curitem + curitem->qoperator.left,
2068 : arg, chkcond,
2069 : &llocations))
2070 204 : return false;
2071 60 : if (!TS_execute_locations_recurse(curitem + 1,
2072 : arg, chkcond,
2073 : &rlocations))
2074 27 : return false;
2075 33 : *locations = list_concat(llocations, rlocations);
2076 33 : return true;
2077 :
2078 12 : case OP_OR:
2079 12 : lmatch = TS_execute_locations_recurse(curitem + curitem->qoperator.left,
2080 : arg, chkcond,
2081 : &llocations);
2082 12 : rmatch = TS_execute_locations_recurse(curitem + 1,
2083 : arg, chkcond,
2084 : &rlocations);
2085 12 : if (lmatch || rmatch)
2086 : {
2087 : /*
2088 : * We generate an AND'able location struct from each
2089 : * combination of sub-matches, following the disjunctive law
2090 : * (A & B) | (C & D) = (A | C) & (A | D) & (B | C) & (B | D).
2091 : *
2092 : * However, if either input didn't produce locations (i.e., it
2093 : * failed or was a NOT), we must just return the other list.
2094 : */
2095 12 : if (llocations == NIL)
80 tgl 2096 UNC 0 : *locations = rlocations;
80 tgl 2097 GNC 12 : else if (rlocations == NIL)
2098 6 : *locations = llocations;
2099 : else
2100 : {
2101 : ListCell *ll;
2102 :
2103 12 : foreach(ll, llocations)
2104 : {
2105 6 : ExecPhraseData *ldata = (ExecPhraseData *) lfirst(ll);
2106 : ListCell *lr;
2107 :
2108 12 : foreach(lr, rlocations)
2109 : {
2110 6 : ExecPhraseData *rdata = (ExecPhraseData *) lfirst(lr);
2111 :
2112 6 : data = palloc0_object(ExecPhraseData);
2113 6 : (void) TS_phrase_output(data, ldata, rdata,
2114 : TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY,
2115 : 0, 0,
2116 6 : ldata->npos + rdata->npos);
2117 : /* Report the larger width, as explained above. */
2118 6 : data->width = Max(ldata->width, rdata->width);
2119 6 : *locations = lappend(*locations, data);
2120 : }
2121 : }
2122 : }
2123 :
2124 12 : return true;
2125 : }
80 tgl 2126 UNC 0 : return false;
2127 :
80 tgl 2128 GNC 30 : case OP_PHRASE:
2129 : /* We can hand this off to TS_phrase_execute */
2130 30 : data = palloc0_object(ExecPhraseData);
2131 30 : if (TS_phrase_execute(curitem, arg, TS_EXEC_EMPTY, chkcond,
2132 : data) == TS_YES)
2133 : {
2134 30 : if (!data->negate)
2135 30 : *locations = list_make1(data);
2136 30 : return true;
2137 : }
80 tgl 2138 UNC 0 : pfree(data);
2139 0 : return false;
2140 :
2141 0 : default:
2142 0 : elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
2143 : }
2144 :
2145 : /* not reachable, but keep compiler quiet */
2146 : return false;
2147 : }
2148 :
2149 : /*
2150 : * Detect whether a tsquery boolean expression requires any positive matches
2151 : * to values shown in the tsquery.
2152 : *
2153 : * This is needed to know whether a GIN index search requires full index scan.
2154 : * For example, 'x & !y' requires a match of x, so it's sufficient to scan
2155 : * entries for x; but 'x | !y' could match rows containing neither x nor y.
2156 : */
2157 : bool
4473 tgl 2158 GIC 417 : tsquery_requires_match(QueryItem *curitem)
2159 : {
2160 : /* since this function recurses, it could be driven to stack overflow */
2161 417 : check_stack_depth();
2162 :
2163 417 : if (curitem->type == QI_VAL)
2164 198 : return true;
2165 :
2166 219 : switch (curitem->qoperator.oper)
2167 : {
2168 84 : case OP_NOT:
2169 :
2170 : /*
2171 : * Assume there are no required matches underneath a NOT. For
2172 : * some cases with nested NOTs, we could prove there's a required
2173 : * match, but it seems unlikely to be worth the trouble.
2174 : */
2175 84 : return false;
2176 :
2558 teodor 2177 102 : case OP_PHRASE:
2178 :
2179 : /*
2180 : * Treat OP_PHRASE as OP_AND here
2558 teodor 2181 ECB : */
2182 : case OP_AND:
2183 : /* If either side requires a match, we're good */
4473 tgl 2184 GIC 102 : if (tsquery_requires_match(curitem + curitem->qoperator.left))
2185 78 : return true;
2186 : else
2187 24 : return tsquery_requires_match(curitem + 1);
4473 tgl 2188 ECB :
4473 tgl 2189 CBC 33 : case OP_OR:
4473 tgl 2190 ECB : /* Both sides must require a match */
4473 tgl 2191 CBC 33 : if (tsquery_requires_match(curitem + curitem->qoperator.left))
4473 tgl 2192 GIC 33 : return tsquery_requires_match(curitem + 1);
2193 : else
4473 tgl 2194 UIC 0 : return false;
2195 :
2196 0 : default:
2197 0 : elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper);
2198 : }
5693 teodor 2199 ECB :
2200 : /* not reachable, but keep compiler quiet */
2201 : return false;
2202 : }
2203 :
2204 : /*
2205 : * boolean operations
2206 : */
2207 : Datum
5710 tgl 2208 GIC 30 : ts_match_qv(PG_FUNCTION_ARGS)
2209 : {
5710 tgl 2210 CBC 30 : PG_RETURN_DATUM(DirectFunctionCall2(ts_match_vq,
2211 : PG_GETARG_DATUM(1),
2212 : PG_GETARG_DATUM(0)));
5710 tgl 2213 ECB : }
2214 :
2215 : Datum
5710 tgl 2216 CBC 110040 : ts_match_vq(PG_FUNCTION_ARGS)
2217 : {
2218 110040 : TSVector val = PG_GETARG_TSVECTOR(0);
5710 tgl 2219 GIC 110040 : TSQuery query = PG_GETARG_TSQUERY(1);
5710 tgl 2220 ECB : CHKVAL chkval;
2221 : bool result;
2222 :
2264 2223 : /* empty query matches nothing */
2264 tgl 2224 CBC 110040 : if (!query->size)
2225 : {
5710 tgl 2226 LBC 0 : PG_FREE_IF_COPY(val, 0);
2227 0 : PG_FREE_IF_COPY(query, 1);
5710 tgl 2228 UIC 0 : PG_RETURN_BOOL(false);
2229 : }
5710 tgl 2230 ECB :
5710 tgl 2231 GIC 110040 : chkval.arrb = ARRPTR(val);
5710 tgl 2232 CBC 110040 : chkval.arre = chkval.arrb + val->size;
2233 110040 : chkval.values = STRPTR(val);
5710 tgl 2234 GIC 110040 : chkval.operand = GETOPERAND(query);
2305 tgl 2235 GBC 110040 : result = TS_execute(GETQUERY(query),
5710 tgl 2236 ECB : &chkval,
2237 : TS_EXEC_EMPTY,
2305 2238 : checkcondition_str);
5710 2239 :
5710 tgl 2240 GIC 110040 : PG_FREE_IF_COPY(val, 0);
2241 110040 : PG_FREE_IF_COPY(query, 1);
5710 tgl 2242 CBC 110040 : PG_RETURN_BOOL(result);
5710 tgl 2243 ECB : }
2244 :
2245 : Datum
5710 tgl 2246 LBC 0 : ts_match_tt(PG_FUNCTION_ARGS)
5710 tgl 2247 ECB : {
2248 : TSVector vector;
2249 : TSQuery query;
2250 : bool res;
2251 :
5710 tgl 2252 UIC 0 : vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
2253 : PG_GETARG_DATUM(0)));
5710 tgl 2254 LBC 0 : query = DatumGetTSQuery(DirectFunctionCall1(plainto_tsquery,
2255 : PG_GETARG_DATUM(1)));
2256 :
2257 0 : res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
2258 : TSVectorGetDatum(vector),
2259 : TSQueryGetDatum(query)));
2260 :
5710 tgl 2261 UIC 0 : pfree(vector);
2262 0 : pfree(query);
2263 :
2264 0 : PG_RETURN_BOOL(res);
2265 : }
2266 :
5710 tgl 2267 ECB : Datum
5710 tgl 2268 UBC 0 : ts_match_tq(PG_FUNCTION_ARGS)
5710 tgl 2269 ECB : {
2270 : TSVector vector;
5710 tgl 2271 UIC 0 : TSQuery query = PG_GETARG_TSQUERY(1);
2272 : bool res;
2273 :
2274 0 : vector = DatumGetTSVector(DirectFunctionCall1(to_tsvector,
5710 tgl 2275 ECB : PG_GETARG_DATUM(0)));
2276 :
5710 tgl 2277 LBC 0 : res = DatumGetBool(DirectFunctionCall2(ts_match_vq,
2278 : TSVectorGetDatum(vector),
2279 : TSQueryGetDatum(query)));
5710 tgl 2280 ECB :
5710 tgl 2281 UIC 0 : pfree(vector);
5710 tgl 2282 LBC 0 : PG_FREE_IF_COPY(query, 1);
2283 :
2284 0 : PG_RETURN_BOOL(res);
5710 tgl 2285 ECB : }
2286 :
2287 : /*
5689 teodor 2288 : * ts_stat statistic function support
2289 : */
2290 :
2291 :
2292 : /*
2293 : * Returns the number of positions in value 'wptr' within tsvector 'txt',
2294 : * that have a weight equal to one of the weights in 'weight' bitmask.
2295 : */
5710 tgl 2296 : static int
5624 bruce 2297 GIC 4089 : check_weight(TSVector txt, WordEntry *wptr, int8 weight)
5710 tgl 2298 EUB : {
5710 tgl 2299 GIC 4089 : int len = POSDATALEN(txt, wptr);
5710 tgl 2300 CBC 4089 : int num = 0;
5710 tgl 2301 GIC 4089 : WordEntryPos *ptr = POSDATAPTR(txt, wptr);
5710 tgl 2302 ECB :
5710 tgl 2303 CBC 8325 : while (len--)
2304 : {
5710 tgl 2305 GIC 4236 : if (weight & (1 << WEP_GETWEIGHT(*ptr)))
5710 tgl 2306 CBC 6 : num++;
2307 4236 : ptr++;
5710 tgl 2308 ECB : }
5710 tgl 2309 GIC 4089 : return num;
5710 tgl 2310 EUB : }
2311 :
/*
 * Compare the lexeme stored in StatEntry 'sentry' with the lexeme of
 * WordEntry 'wentry', whose text lives in the string area of tsvector 'vec'.
 * The final argument to tsCompareString is always false.
 */
#define compareStatWord(sentry,wentry,vec) \
	tsCompareString((sentry)->lexeme, (sentry)->lenlexeme, \
					STRPTR(vec) + (wentry)->pos, (wentry)->len, \
					false)
2316 :
2317 : static void
5256 teodor 2318 GIC 172812 : insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt, uint32 off)
2319 : {
5050 bruce 2320 172812 : WordEntry *we = ARRPTR(txt) + off;
2321 172812 : StatEntry *node = stat->root,
2322 172812 : *pnode = NULL;
2323 : int n,
5254 teodor 2324 172812 : res = 0;
5050 bruce 2325 172812 : uint32 depth = 1;
2326 :
2327 172812 : if (stat->weight == 0)
5256 teodor 2328 86406 : n = (we->haspos) ? POSDATALEN(txt, we) : 1;
2329 : else
5256 teodor 2330 CBC 86406 : n = (we->haspos) ? check_weight(txt, we, stat->weight) : 0;
2331 :
5050 bruce 2332 GIC 172812 : if (n == 0)
5050 bruce 2333 CBC 86403 : return; /* nothing to insert */
2334 :
2335 872691 : while (node)
5256 teodor 2336 ECB : {
5256 teodor 2337 GIC 869259 : res = compareStatWord(node, we, txt);
5710 tgl 2338 ECB :
5256 teodor 2339 GIC 869259 : if (res == 0)
5256 teodor 2340 ECB : {
5256 teodor 2341 GIC 82977 : break;
2342 : }
2343 : else
2344 : {
2345 786282 : pnode = node;
5050 bruce 2346 786282 : node = (res < 0) ? node->left : node->right;
5256 teodor 2347 ECB : }
5256 teodor 2348 GIC 786282 : depth++;
5256 teodor 2349 ECB : }
2350 :
5256 teodor 2351 GIC 86409 : if (depth > stat->maxdepth)
2352 63 : stat->maxdepth = depth;
2353 :
2354 86409 : if (node == NULL)
2355 : {
5050 bruce 2356 CBC 3432 : node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len);
5256 teodor 2357 3432 : node->left = node->right = NULL;
5256 teodor 2358 GIC 3432 : node->ndoc = 1;
5256 teodor 2359 CBC 3432 : node->nentry = n;
5256 teodor 2360 GIC 3432 : node->lenlexeme = we->len;
5256 teodor 2361 CBC 3432 : memcpy(node->lexeme, STRPTR(txt) + we->pos, node->lenlexeme);
2362 :
5050 bruce 2363 3432 : if (pnode == NULL)
5710 tgl 2364 ECB : {
5256 teodor 2365 GIC 6 : stat->root = node;
5710 tgl 2366 EUB : }
2367 : else
5256 teodor 2368 : {
5256 teodor 2369 GBC 3426 : if (res < 0)
5256 teodor 2370 GIC 1690 : pnode->left = node;
2371 : else
2372 1736 : pnode->right = node;
2373 : }
2374 : }
2375 : else
2376 : {
2377 82977 : node->ndoc++;
2378 82977 : node->nentry += n;
2379 : }
5256 teodor 2380 ECB : }
2381 :
2382 : static void
5050 bruce 2383 GIC 247692 : chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector txt,
2384 : uint32 low, uint32 high, uint32 offset)
2385 : {
2386 : uint32 pos;
2387 247692 : uint32 middle = (low + high) >> 1;
5256 teodor 2388 ECB :
5256 teodor 2389 GIC 247692 : pos = (low + middle) >> 1;
5256 teodor 2390 CBC 247692 : if (low != middle && pos >= offset && pos - offset < txt->size)
5050 bruce 2391 85164 : insertStatEntry(persistentContext, stat, txt, pos - offset);
5256 teodor 2392 GIC 247692 : pos = (high + middle + 1) >> 1;
2393 247692 : if (middle + 1 != high && pos >= offset && pos - offset < txt->size)
5050 bruce 2394 84642 : insertStatEntry(persistentContext, stat, txt, pos - offset);
2395 :
5256 teodor 2396 CBC 247692 : if (low != middle)
5256 teodor 2397 GIC 123846 : chooseNextStatEntry(persistentContext, stat, txt, low, middle, offset);
5256 teodor 2398 GBC 247692 : if (high != middle + 1)
2399 120840 : chooseNextStatEntry(persistentContext, stat, txt, middle + 1, high, offset);
5710 tgl 2400 247692 : }
2401 :
2402 : /*
5689 teodor 2403 ECB : * This is written like a custom aggregate function, because the
2404 : * original plan was to do just that. Unfortunately, an aggregate function
2405 : * can't return a set, so that plan was abandoned. If that limitation is
5624 bruce 2406 : * lifted in the future, ts_stat could be a real aggregate function so that
5689 teodor 2407 : * you could use it like this:
2408 : *
2409 : * SELECT ts_stat(vector_column) FROM vector_table;
2410 : *
2411 : * where vector_column is a tsvector-type column in vector_table.
2412 : */
2413 :
5256 2414 : static TSVectorStat *
5256 teodor 2415 GIC 3054 : ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data)
2416 : {
5050 bruce 2417 3054 : TSVector txt = DatumGetTSVector(data);
5050 bruce 2418 EUB : uint32 i,
5050 bruce 2419 GIC 3054 : nbit = 0,
2420 : offset;
2421 :
5710 tgl 2422 3054 : if (stat == NULL)
2423 : { /* Init in first */
5256 teodor 2424 UBC 0 : stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
5256 teodor 2425 UIC 0 : stat->maxdepth = 1;
5710 tgl 2426 EUB : }
2427 :
2428 : /* simple check of correctness */
5710 tgl 2429 GBC 3054 : if (txt == NULL || txt->size == 0)
2430 : {
5256 teodor 2431 GIC 48 : if (txt && txt != (TSVector) DatumGetPointer(data))
5710 tgl 2432 48 : pfree(txt);
5710 tgl 2433 GBC 48 : return stat;
5710 tgl 2434 EUB : }
2435 :
5256 teodor 2436 GBC 3006 : i = txt->size - 1;
5256 teodor 2437 GIC 21360 : for (; i > 0; i >>= 1)
2438 18354 : nbit++;
2439 :
5256 teodor 2440 GBC 3006 : nbit = 1 << nbit;
5256 teodor 2441 GIC 3006 : offset = (nbit - txt->size) / 2;
2442 :
5050 bruce 2443 GBC 3006 : insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset);
5256 teodor 2444 GIC 3006 : chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset);
2445 :
5256 teodor 2446 GBC 3006 : return stat;
2447 : }
2448 :
5710 tgl 2449 EUB : static void
5710 tgl 2450 GIC 6 : ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx,
2451 : TSVectorStat *stat)
2452 : {
5050 bruce 2453 EUB : TupleDesc tupdesc;
2454 : MemoryContext oldcontext;
2455 : StatEntry *node;
5256 teodor 2456 :
5256 teodor 2457 GIC 6 : funcctx->user_fctx = (void *) stat;
2458 :
5710 tgl 2459 6 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
2460 :
5256 teodor 2461 6 : stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1));
5050 bruce 2462 6 : stat->stackpos = 0;
2463 :
5256 teodor 2464 6 : node = stat->root;
2465 : /* find leftmost value */
4926 tgl 2466 6 : if (node == NULL)
4926 tgl 2467 UIC 0 : stat->stack[stat->stackpos] = NULL;
2468 : else
4926 tgl 2469 ECB : for (;;)
2470 : {
4926 tgl 2471 CBC 24 : stat->stack[stat->stackpos] = node;
2472 24 : if (node->left)
4926 tgl 2473 ECB : {
4926 tgl 2474 GIC 18 : stat->stackpos++;
4926 tgl 2475 CBC 18 : node = node->left;
2476 : }
4926 tgl 2477 ECB : else
4926 tgl 2478 CBC 6 : break;
5256 teodor 2479 ECB : }
4926 tgl 2480 GIC 6 : Assert(stat->stackpos <= stat->maxdepth);
5710 tgl 2481 ECB :
109 michael 2482 GNC 6 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
109 michael 2483 UNC 0 : elog(ERROR, "return type must be a row type");
109 michael 2484 GNC 6 : funcctx->tuple_desc = tupdesc;
5710 tgl 2485 CBC 6 : funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
2486 :
2487 6 : MemoryContextSwitchTo(oldcontext);
2488 6 : }
5710 tgl 2489 ECB :
2490 : static StatEntry *
5050 bruce 2491 CBC 6864 : walkStatEntryTree(TSVectorStat *stat)
5256 teodor 2492 ECB : {
5050 bruce 2493 GIC 6864 : StatEntry *node = stat->stack[stat->stackpos];
5256 teodor 2494 ECB :
5050 bruce 2495 CBC 6864 : if (node == NULL)
5256 teodor 2496 UIC 0 : return NULL;
5256 teodor 2497 ECB :
5050 bruce 2498 GIC 6864 : if (node->ndoc != 0)
5256 teodor 2499 ECB : {
2500 : /* return entry itself: we already was at left sublink */
5256 teodor 2501 GIC 1696 : return node;
5256 teodor 2502 ECB : }
5256 teodor 2503 GIC 5168 : else if (node->right && node->right != stat->stack[stat->stackpos + 1])
5256 teodor 2504 ECB : {
2505 : /* go on right sublink */
5256 teodor 2506 CBC 1736 : stat->stackpos++;
5256 teodor 2507 GIC 1736 : node = node->right;
5256 teodor 2508 ECB :
2509 : /* find most-left value */
2510 : for (;;)
2511 : {
5256 teodor 2512 CBC 3408 : stat->stack[stat->stackpos] = node;
2513 3408 : if (node->left)
2514 : {
2515 1672 : stat->stackpos++;
5256 teodor 2516 GIC 1672 : node = node->left;
2517 : }
5256 teodor 2518 ECB : else
5256 teodor 2519 CBC 1736 : break;
2520 : }
4926 tgl 2521 1736 : Assert(stat->stackpos <= stat->maxdepth);
2522 : }
5256 teodor 2523 ECB : else
2524 : {
2525 : /* we already return all left subtree, itself and right subtree */
5256 teodor 2526 CBC 3432 : if (stat->stackpos == 0)
2527 6 : return NULL;
5256 teodor 2528 ECB :
5256 teodor 2529 GIC 3426 : stat->stackpos--;
5256 teodor 2530 CBC 3426 : return walkStatEntryTree(stat);
2531 : }
5256 teodor 2532 ECB :
5256 teodor 2533 GIC 1736 : return node;
2534 : }
2535 :
5710 tgl 2536 ECB : static Datum
5710 tgl 2537 CBC 3438 : ts_process_call(FuncCallContext *funcctx)
2538 : {
5050 bruce 2539 ECB : TSVectorStat *st;
2540 : StatEntry *entry;
2541 :
5256 teodor 2542 GIC 3438 : st = (TSVectorStat *) funcctx->user_fctx;
2543 :
5256 teodor 2544 CBC 3438 : entry = walkStatEntryTree(st);
5710 tgl 2545 ECB :
5256 teodor 2546 GIC 3438 : if (entry != NULL)
2547 : {
2548 : Datum result;
2549 : char *values[3];
5710 tgl 2550 ECB : char ndoc[16];
2551 : char nentry[16];
2552 : HeapTuple tuple;
2553 :
5256 teodor 2554 CBC 3432 : values[0] = palloc(entry->lenlexeme + 1);
5256 teodor 2555 GIC 3432 : memcpy(values[0], entry->lexeme, entry->lenlexeme);
5256 teodor 2556 CBC 3432 : (values[0])[entry->lenlexeme] = '\0';
5710 tgl 2557 3432 : sprintf(ndoc, "%d", entry->ndoc);
2558 3432 : values[1] = ndoc;
2559 3432 : sprintf(nentry, "%d", entry->nentry);
2560 3432 : values[2] = nentry;
5710 tgl 2561 ECB :
5710 tgl 2562 GIC 3432 : tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
5710 tgl 2563 CBC 3432 : result = HeapTupleGetDatum(tuple);
5710 tgl 2564 ECB :
5710 tgl 2565 CBC 3432 : pfree(values[0]);
5256 teodor 2566 ECB :
2567 : /* mark entry as already visited */
5256 teodor 2568 GIC 3432 : entry->ndoc = 0;
2569 :
5710 tgl 2570 3432 : return result;
2571 : }
2572 :
2573 6 : return (Datum) 0;
2574 : }
2575 :
2576 : static TSVectorStat *
5256 teodor 2577 6 : ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws)
2578 : {
5493 tgl 2579 6 : char *query = text_to_cstring(txt);
2580 : TSVectorStat *stat;
2581 : bool isnull;
5710 tgl 2582 ECB : Portal portal;
2583 : SPIPlanPtr plan;
2584 :
5710 tgl 2585 GIC 6 : if ((plan = SPI_prepare(query, 0, NULL)) == NULL)
5710 tgl 2586 ECB : /* internal error */
5710 tgl 2587 UIC 0 : elog(ERROR, "SPI_prepare(\"%s\") failed", query);
2588 :
5646 tgl 2589 CBC 6 : if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL)
2590 : /* internal error */
5710 tgl 2591 UBC 0 : elog(ERROR, "SPI_cursor_open(\"%s\") failed", query);
5710 tgl 2592 EUB :
5710 tgl 2593 GIC 6 : SPI_cursor_fetch(portal, true, 100);
2594 :
5646 2595 6 : if (SPI_tuptable == NULL ||
5646 tgl 2596 CBC 6 : SPI_tuptable->tupdesc->natts != 1 ||
2761 teodor 2597 GIC 6 : !IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1),
2495 rhaas 2598 ECB : TSVECTOROID))
5710 tgl 2599 LBC 0 : ereport(ERROR,
5710 tgl 2600 ECB : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2601 : errmsg("ts_stat query must return one tsvector column")));
2602 :
5256 teodor 2603 CBC 6 : stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat));
2604 6 : stat->maxdepth = 1;
5710 tgl 2605 ECB :
5710 tgl 2606 GIC 6 : if (ws)
5710 tgl 2607 ECB : {
2608 : char *buf;
2609 :
2219 noah 2610 CBC 3 : buf = VARDATA_ANY(ws);
2611 9 : while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws))
2612 : {
5710 tgl 2613 6 : if (pg_mblen(buf) == 1)
2614 : {
5710 tgl 2615 GIC 6 : switch (*buf)
2616 : {
5710 tgl 2617 CBC 3 : case 'A':
2618 : case 'a':
5710 tgl 2619 GIC 3 : stat->weight |= 1 << 3;
2620 3 : break;
2621 3 : case 'B':
2622 : case 'b':
2623 3 : stat->weight |= 1 << 2;
5710 tgl 2624 CBC 3 : break;
5710 tgl 2625 UIC 0 : case 'C':
5710 tgl 2626 ECB : case 'c':
5710 tgl 2627 UIC 0 : stat->weight |= 1 << 1;
5710 tgl 2628 LBC 0 : break;
2629 0 : case 'D':
2630 : case 'd':
2631 0 : stat->weight |= 1;
5710 tgl 2632 UIC 0 : break;
5710 tgl 2633 LBC 0 : default:
5710 tgl 2634 UBC 0 : stat->weight |= 0;
2635 : }
2636 : }
5710 tgl 2637 GIC 6 : buf += pg_mblen(buf);
5710 tgl 2638 ECB : }
2639 : }
2640 :
5710 tgl 2641 CBC 42 : while (SPI_processed > 0)
5710 tgl 2642 ECB : {
2643 : uint64 i;
2644 :
5710 tgl 2645 CBC 3090 : for (i = 0; i < SPI_processed; i++)
2646 : {
2647 3054 : Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull);
2648 :
2649 3054 : if (!isnull)
5256 teodor 2650 GBC 3054 : stat = ts_accum(persistentContext, stat, data);
5710 tgl 2651 ECB : }
2652 :
5710 tgl 2653 GIC 36 : SPI_freetuptable(SPI_tuptable);
5710 tgl 2654 CBC 36 : SPI_cursor_fetch(portal, true, 100);
5710 tgl 2655 ECB : }
2656 :
5710 tgl 2657 GIC 6 : SPI_freetuptable(SPI_tuptable);
5710 tgl 2658 CBC 6 : SPI_cursor_close(portal);
5710 tgl 2659 GIC 6 : SPI_freeplan(plan);
5710 tgl 2660 CBC 6 : pfree(query);
2661 :
2662 6 : return stat;
5710 tgl 2663 EUB : }
2664 :
5710 tgl 2665 ECB : Datum
5710 tgl 2666 GIC 3432 : ts_stat1(PG_FUNCTION_ARGS)
2667 : {
5710 tgl 2668 ECB : FuncCallContext *funcctx;
2669 : Datum result;
2670 :
5710 tgl 2671 GIC 3432 : if (SRF_IS_FIRSTCALL())
2672 : {
5050 bruce 2673 ECB : TSVectorStat *stat;
2219 noah 2674 CBC 3 : text *txt = PG_GETARG_TEXT_PP(0);
2675 :
5710 tgl 2676 GIC 3 : funcctx = SRF_FIRSTCALL_INIT();
2677 3 : SPI_connect();
5256 teodor 2678 3 : stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, NULL);
5710 tgl 2679 CBC 3 : PG_FREE_IF_COPY(txt, 0);
2680 3 : ts_setup_firstcall(fcinfo, funcctx, stat);
5710 tgl 2681 GIC 3 : SPI_finish();
5710 tgl 2682 ECB : }
2683 :
5710 tgl 2684 GIC 3432 : funcctx = SRF_PERCALL_SETUP();
2685 3432 : if ((result = ts_process_call(funcctx)) != (Datum) 0)
5710 tgl 2686 CBC 3429 : SRF_RETURN_NEXT(funcctx, result);
5710 tgl 2687 GIC 3 : SRF_RETURN_DONE(funcctx);
5710 tgl 2688 ECB : }
2689 :
2690 : Datum
5710 tgl 2691 GIC 6 : ts_stat2(PG_FUNCTION_ARGS)
2692 : {
5710 tgl 2693 ECB : FuncCallContext *funcctx;
2694 : Datum result;
2695 :
5710 tgl 2696 CBC 6 : if (SRF_IS_FIRSTCALL())
5710 tgl 2697 ECB : {
2698 : TSVectorStat *stat;
2219 noah 2699 GIC 3 : text *txt = PG_GETARG_TEXT_PP(0);
2219 noah 2700 CBC 3 : text *ws = PG_GETARG_TEXT_PP(1);
2701 :
5710 tgl 2702 GIC 3 : funcctx = SRF_FIRSTCALL_INIT();
2703 3 : SPI_connect();
5256 teodor 2704 CBC 3 : stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, ws);
5710 tgl 2705 GIC 3 : PG_FREE_IF_COPY(txt, 0);
2706 3 : PG_FREE_IF_COPY(ws, 1);
2707 3 : ts_setup_firstcall(fcinfo, funcctx, stat);
2708 3 : SPI_finish();
5710 tgl 2709 ECB : }
2710 :
5710 tgl 2711 CBC 6 : funcctx = SRF_PERCALL_SETUP();
5710 tgl 2712 GIC 6 : if ((result = ts_process_call(funcctx)) != (Datum) 0)
5710 tgl 2713 CBC 3 : SRF_RETURN_NEXT(funcctx, result);
5710 tgl 2714 GIC 3 : SRF_RETURN_DONE(funcctx);
2715 : }
2716 :
2717 :
2718 : /*
2719 : * Triggers for automatic update of a tsvector column from text column(s)
2720 : *
5710 tgl 2721 ECB : * Trigger arguments are either
2722 : * name of tsvector col, name of tsconfig to use, name(s) of text col(s)
2723 : * name of tsvector col, name of regconfig col, name(s) of text col(s)
2724 : * ie, tsconfig can either be specified by name, or indirectly as the
2725 : * contents of a regconfig field in the row. If the name is used, it must
2726 : * be explicitly schema-qualified.
2727 : */
2728 : Datum
5710 tgl 2729 CBC 9 : tsvector_update_trigger_byid(PG_FUNCTION_ARGS)
5710 tgl 2730 ECB : {
5710 tgl 2731 GIC 9 : return tsvector_update_trigger(fcinfo, false);
5710 tgl 2732 ECB : }
2733 :
2734 : Datum
5710 tgl 2735 LBC 0 : tsvector_update_trigger_bycolumn(PG_FUNCTION_ARGS)
2736 : {
2737 0 : return tsvector_update_trigger(fcinfo, true);
2738 : }
2739 :
5710 tgl 2740 ECB : static Datum
5710 tgl 2741 GIC 9 : tsvector_update_trigger(PG_FUNCTION_ARGS, bool config_column)
2742 : {
2743 : TriggerData *trigdata;
5710 tgl 2744 ECB : Trigger *trigger;
2745 : Relation rel;
5710 tgl 2746 CBC 9 : HeapTuple rettuple = NULL;
2747 : int tsvector_attr_num,
2748 : i;
2749 : ParsedText prs;
2750 : Datum datum;
2751 : bool isnull;
5710 tgl 2752 ECB : text *txt;
2753 : Oid cfgId;
1126 peter 2754 EUB : bool update_needed;
2755 :
5710 tgl 2756 ECB : /* Check call context */
2118 tgl 2757 GIC 9 : if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */
5710 tgl 2758 UBC 0 : elog(ERROR, "tsvector_update_trigger: not fired by trigger manager");
2759 :
5710 tgl 2760 CBC 9 : trigdata = (TriggerData *) fcinfo->context;
4566 tgl 2761 GIC 9 : if (!TRIGGER_FIRED_FOR_ROW(trigdata->tg_event))
4566 tgl 2762 LBC 0 : elog(ERROR, "tsvector_update_trigger: must be fired for row");
4566 tgl 2763 CBC 9 : if (!TRIGGER_FIRED_BEFORE(trigdata->tg_event))
5710 tgl 2764 LBC 0 : elog(ERROR, "tsvector_update_trigger: must be fired BEFORE event");
2765 :
5710 tgl 2766 GBC 9 : if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
2767 : {
5710 tgl 2768 GIC 6 : rettuple = trigdata->tg_trigtuple;
1126 peter 2769 6 : update_needed = true;
1126 peter 2770 ECB : }
5710 tgl 2771 CBC 3 : else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
2772 : {
2773 3 : rettuple = trigdata->tg_newtuple;
1126 peter 2774 GIC 3 : update_needed = false; /* computed below */
2775 : }
2776 : else
5710 tgl 2777 LBC 0 : elog(ERROR, "tsvector_update_trigger: must be fired for INSERT or UPDATE");
5710 tgl 2778 ECB :
5710 tgl 2779 GIC 9 : trigger = trigdata->tg_trigger;
5710 tgl 2780 CBC 9 : rel = trigdata->tg_relation;
2781 :
2782 9 : if (trigger->tgnargs < 3)
5710 tgl 2783 UIC 0 : elog(ERROR, "tsvector_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)");
5710 tgl 2784 ECB :
2785 : /* Find the target tsvector column */
5710 tgl 2786 CBC 9 : tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
2787 9 : if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE)
5710 tgl 2788 LBC 0 : ereport(ERROR,
2789 : (errcode(ERRCODE_UNDEFINED_COLUMN),
5710 tgl 2790 ECB : errmsg("tsvector column \"%s\" does not exist",
2791 : trigger->tgargs[0])));
2343 tgl 2792 EUB : /* This will effectively reject system columns, so no separate test: */
2761 teodor 2793 GIC 9 : if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, tsvector_attr_num),
2495 rhaas 2794 EUB : TSVECTOROID))
5710 tgl 2795 UBC 0 : ereport(ERROR,
5710 tgl 2796 EUB : (errcode(ERRCODE_DATATYPE_MISMATCH),
2797 : errmsg("column \"%s\" is not of tsvector type",
2798 : trigger->tgargs[0])));
2799 :
2800 : /* Find the configuration to use */
5710 tgl 2801 GBC 9 : if (config_column)
2802 : {
2803 : int config_attr_num;
5710 tgl 2804 ECB :
5710 tgl 2805 UIC 0 : config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]);
2806 0 : if (config_attr_num == SPI_ERROR_NOATTRIBUTE)
2807 0 : ereport(ERROR,
5710 tgl 2808 ECB : (errcode(ERRCODE_UNDEFINED_COLUMN),
2809 : errmsg("configuration column \"%s\" does not exist",
2810 : trigger->tgargs[1])));
2761 teodor 2811 UIC 0 : if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, config_attr_num),
2495 rhaas 2812 ECB : REGCONFIGOID))
5710 tgl 2813 UIC 0 : ereport(ERROR,
5710 tgl 2814 ECB : (errcode(ERRCODE_DATATYPE_MISMATCH),
2815 : errmsg("column \"%s\" is not of regconfig type",
2816 : trigger->tgargs[1])));
2817 :
5710 tgl 2818 UIC 0 : datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull);
2819 0 : if (isnull)
5710 tgl 2820 LBC 0 : ereport(ERROR,
5710 tgl 2821 ECB : (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
2822 : errmsg("configuration column \"%s\" must not be null",
2823 : trigger->tgargs[1])));
5710 tgl 2824 LBC 0 : cfgId = DatumGetObjectId(datum);
5710 tgl 2825 ECB : }
2826 : else
2827 : {
2828 : List *names;
2829 :
103 tgl 2830 GNC 9 : names = stringToQualifiedNameList(trigger->tgargs[1], NULL);
2831 : /* require a schema so that results are not search path dependent */
5710 tgl 2832 GIC 9 : if (list_length(names) < 2)
5710 tgl 2833 LBC 0 : ereport(ERROR,
2834 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2835 : errmsg("text search configuration name \"%s\" must be schema-qualified",
2836 : trigger->tgargs[1])));
4630 rhaas 2837 GIC 9 : cfgId = get_ts_config_oid(names, false);
5710 tgl 2838 ECB : }
2839 :
2840 : /* initialize parse state */
5710 tgl 2841 CBC 9 : prs.lenwords = 32;
5710 tgl 2842 GIC 9 : prs.curwords = 0;
5710 tgl 2843 CBC 9 : prs.pos = 0;
2844 9 : prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
5710 tgl 2845 ECB :
2846 : /* find all words in indexable column(s) */
5710 tgl 2847 CBC 18 : for (i = 2; i < trigger->tgnargs; i++)
5710 tgl 2848 ECB : {
2849 : int numattr;
2850 :
5710 tgl 2851 CBC 9 : numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
2852 9 : if (numattr == SPI_ERROR_NOATTRIBUTE)
5710 tgl 2853 LBC 0 : ereport(ERROR,
5710 tgl 2854 ECB : (errcode(ERRCODE_UNDEFINED_COLUMN),
2855 : errmsg("column \"%s\" does not exist",
2856 : trigger->tgargs[i])));
2761 teodor 2857 GIC 9 : if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, numattr), TEXTOID))
5710 tgl 2858 LBC 0 : ereport(ERROR,
2859 : (errcode(ERRCODE_DATATYPE_MISMATCH),
2860 : errmsg("column \"%s\" is not of a character type",
2861 : trigger->tgargs[i])));
2862 :
1126 peter 2863 CBC 9 : if (bms_is_member(numattr - FirstLowInvalidHeapAttributeNumber, trigdata->tg_updatedcols))
1126 peter 2864 GIC 3 : update_needed = true;
2865 :
5710 tgl 2866 CBC 9 : datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull);
2867 9 : if (isnull)
5710 tgl 2868 GIC 3 : continue;
5710 tgl 2869 ECB :
2219 noah 2870 CBC 6 : txt = DatumGetTextPP(datum);
5710 tgl 2871 ECB :
2219 noah 2872 CBC 6 : parsetext(cfgId, &prs, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt));
5710 tgl 2873 ECB :
5710 tgl 2874 CBC 6 : if (txt != (text *) DatumGetPointer(datum))
5710 tgl 2875 LBC 0 : pfree(txt);
2876 : }
2877 :
1126 peter 2878 CBC 9 : if (update_needed)
1126 peter 2879 ECB : {
2880 : /* make tsvector value */
1126 peter 2881 CBC 9 : datum = TSVectorGetDatum(make_tsvector(&prs));
1126 peter 2882 GIC 9 : isnull = false;
2883 :
2884 : /* and insert it into tuple */
2885 9 : rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att,
2886 : 1, &tsvector_attr_num,
2887 : &datum, &isnull);
2888 :
2889 9 : pfree(DatumGetPointer(datum));
2890 : }
2891 :
5710 tgl 2892 9 : return PointerGetDatum(rettuple);
2893 : }
|