Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsquery_cleanup.c
4 : * Cleanup query from NOT values and/or stopword
5 : * Utility functions to correct work.
6 : *
7 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/tsquery_cleanup.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "miscadmin.h"
19 : #include "tsearch/ts_utils.h"
20 : #include "varatt.h"
21 :
22 : typedef struct NODE
23 : {
24 : struct NODE *left;
25 : struct NODE *right;
26 : QueryItem *valnode;
27 : } NODE;
28 :
29 : /*
30 : * make query tree from plain view of query
31 : */
32 : static NODE *
5624 bruce 33 GIC 1530 : maketree(QueryItem *in)
5710 tgl 34 ECB : {
5710 tgl 35 GIC 1530 : NODE *node = (NODE *) palloc(sizeof(NODE));
5710 tgl 36 ECB :
37 : /* since this function recurses, it could be driven to stack overflow. */
2743 noah 38 GIC 1530 : check_stack_depth();
2743 noah 39 ECB :
5710 tgl 40 GIC 1530 : node->valnode = in;
5710 tgl 41 CBC 1530 : node->right = node->left = NULL;
5693 teodor 42 1530 : if (in->type == QI_OPR)
5710 tgl 43 ECB : {
5710 tgl 44 GIC 675 : node->right = maketree(in + 1);
5015 peter_e 45 CBC 675 : if (in->qoperator.oper != OP_NOT)
46 633 : node->left = maketree(in + in->qoperator.left);
5710 tgl 47 ECB : }
5710 tgl 48 GIC 1530 : return node;
5710 tgl 49 ECB : }
50 :
51 : /*
52 : * Internal state for plaintree and plainnode
53 : */
54 : typedef struct
55 : {
56 : QueryItem *ptr;
57 : int len; /* allocated size of ptr */
58 : int cur; /* number of elements in ptr */
59 : } PLAINTREE;
60 :
61 : static void
5624 bruce 62 GIC 849 : plainnode(PLAINTREE *state, NODE *node)
5710 tgl 63 ECB : {
64 : /* since this function recurses, it could be driven to stack overflow. */
5693 teodor 65 GIC 849 : check_stack_depth();
5693 teodor 66 ECB :
5710 tgl 67 GIC 849 : if (state->cur == state->len)
5710 tgl 68 ECB : {
5710 tgl 69 UIC 0 : state->len *= 2;
61 peter 70 UNC 0 : state->ptr = (QueryItem *) repalloc(state->ptr, state->len * sizeof(QueryItem));
5710 tgl 71 EUB : }
61 peter 72 GNC 849 : memcpy(&(state->ptr[state->cur]), node->valnode, sizeof(QueryItem));
5693 teodor 73 CBC 849 : if (node->valnode->type == QI_VAL)
5710 tgl 74 513 : state->cur++;
5015 peter_e 75 336 : else if (node->valnode->qoperator.oper == OP_NOT)
5710 tgl 76 ECB : {
5015 peter_e 77 GIC 33 : state->ptr[state->cur].qoperator.left = 1;
5710 tgl 78 CBC 33 : state->cur++;
79 33 : plainnode(state, node->right);
5710 tgl 80 ECB : }
81 : else
82 : {
5624 bruce 83 GIC 303 : int cur = state->cur;
5710 tgl 84 ECB :
5710 tgl 85 GIC 303 : state->cur++;
5710 tgl 86 CBC 303 : plainnode(state, node->right);
5015 peter_e 87 303 : state->ptr[cur].qoperator.left = state->cur - cur;
5710 tgl 88 303 : plainnode(state, node->left);
5710 tgl 89 ECB : }
5710 tgl 90 GIC 849 : pfree(node);
5710 tgl 91 CBC 849 : }
5710 tgl 92 ECB :
93 : /*
94 : * make plain view of tree from a NODE-tree representation
95 : */
96 : static QueryItem *
5624 bruce 97 GIC 210 : plaintree(NODE *root, int *len)
5710 tgl 98 ECB : {
99 : PLAINTREE pl;
100 :
5710 tgl 101 GIC 210 : pl.cur = 0;
5710 tgl 102 CBC 210 : pl.len = 16;
5693 teodor 103 210 : if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
5710 tgl 104 ECB : {
5710 tgl 105 GIC 210 : pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
5710 tgl 106 CBC 210 : plainnode(&pl, root);
5710 tgl 107 ECB : }
108 : else
5710 tgl 109 UIC 0 : pl.ptr = NULL;
5710 tgl 110 GBC 210 : *len = pl.cur;
5710 tgl 111 CBC 210 : return pl.ptr;
5710 tgl 112 ECB : }
113 :
114 : static void
5624 bruce 115 GIC 21 : freetree(NODE *node)
5710 tgl 116 ECB : {
117 : /* since this function recurses, it could be driven to stack overflow. */
5693 teodor 118 GIC 21 : check_stack_depth();
5693 teodor 119 ECB :
5710 tgl 120 GIC 21 : if (!node)
5710 tgl 121 LBC 0 : return;
5710 tgl 122 GBC 21 : if (node->left)
5710 tgl 123 LBC 0 : freetree(node->left);
5710 tgl 124 GBC 21 : if (node->right)
5710 tgl 125 LBC 0 : freetree(node->right);
5710 tgl 126 GBC 21 : pfree(node);
5710 tgl 127 ECB : }
128 :
129 : /*
130 : * clean tree for ! operator.
131 : * It's useful for debug, but in
132 : * other case, such view is used with search in index.
133 : * Operator ! always return TRUE
134 : */
135 : static NODE *
5624 bruce 136 UIC 0 : clean_NOT_intree(NODE *node)
5710 tgl 137 EUB : {
138 : /* since this function recurses, it could be driven to stack overflow. */
5693 teodor 139 UIC 0 : check_stack_depth();
5693 teodor 140 EUB :
5693 teodor 141 UIC 0 : if (node->valnode->type == QI_VAL)
5710 tgl 142 UBC 0 : return node;
5710 tgl 143 EUB :
5015 peter_e 144 UIC 0 : if (node->valnode->qoperator.oper == OP_NOT)
5710 tgl 145 EUB : {
5710 tgl 146 UIC 0 : freetree(node);
5710 tgl 147 UBC 0 : return NULL;
5710 tgl 148 EUB : }
149 :
150 : /* operator & or | */
5015 peter_e 151 UIC 0 : if (node->valnode->qoperator.oper == OP_OR)
5710 tgl 152 EUB : {
5710 tgl 153 UIC 0 : if ((node->left = clean_NOT_intree(node->left)) == NULL ||
5710 tgl 154 UBC 0 : (node->right = clean_NOT_intree(node->right)) == NULL)
5710 tgl 155 EUB : {
5710 tgl 156 UIC 0 : freetree(node);
5710 tgl 157 UBC 0 : return NULL;
5710 tgl 158 EUB : }
159 : }
160 : else
161 : {
5710 tgl 162 UIC 0 : NODE *res = node;
5624 bruce 163 EUB :
2558 teodor 164 UIC 0 : Assert(node->valnode->qoperator.oper == OP_AND ||
2558 teodor 165 EUB : node->valnode->qoperator.oper == OP_PHRASE);
166 :
5710 tgl 167 UIC 0 : node->left = clean_NOT_intree(node->left);
5710 tgl 168 UBC 0 : node->right = clean_NOT_intree(node->right);
169 0 : if (node->left == NULL && node->right == NULL)
5710 tgl 170 EUB : {
5710 tgl 171 UIC 0 : pfree(node);
5710 tgl 172 UBC 0 : res = NULL;
5710 tgl 173 EUB : }
5710 tgl 174 UIC 0 : else if (node->left == NULL)
5710 tgl 175 EUB : {
5710 tgl 176 UIC 0 : res = node->right;
5710 tgl 177 UBC 0 : pfree(node);
5710 tgl 178 EUB : }
5710 tgl 179 UIC 0 : else if (node->right == NULL)
5710 tgl 180 EUB : {
5710 tgl 181 UIC 0 : res = node->left;
5710 tgl 182 UBC 0 : pfree(node);
5710 tgl 183 EUB : }
5710 tgl 184 UIC 0 : return res;
5710 tgl 185 EUB : }
5710 tgl 186 UIC 0 : return node;
5710 tgl 187 EUB : }
188 :
189 : QueryItem *
5624 bruce 190 UIC 0 : clean_NOT(QueryItem *ptr, int *len)
5710 tgl 191 EUB : {
5710 tgl 192 UIC 0 : NODE *root = maketree(ptr);
5710 tgl 193 EUB :
5710 tgl 194 UIC 0 : return plaintree(clean_NOT_intree(root), len);
5710 tgl 195 EUB : }
196 :
197 :
198 : /*
199 : * Remove QI_VALSTOP (stopword) nodes from query tree.
200 : *
201 : * Returns NULL if the query degenerates to nothing. Input must not be NULL.
202 : *
203 : * When we remove a phrase operator due to removing one or both of its
204 : * arguments, we might need to adjust the distance of a parent phrase
205 : * operator. For example, 'a' is a stopword, so:
206 : * (b <-> a) <-> c should become b <2> c
207 : * b <-> (a <-> c) should become b <2> c
208 : * (b <-> (a <-> a)) <-> c should become b <3> c
209 : * b <-> ((a <-> a) <-> c) should become b <3> c
210 : * To handle that, we define two output parameters:
211 : * ladd: amount to add to a phrase distance to the left of this node
212 : * radd: amount to add to a phrase distance to the right of this node
213 : * We need two outputs because we could need to bubble up adjustments to two
214 : * different parent phrase operators. Consider
215 : * w <-> (((a <-> x) <2> (y <3> a)) <-> z)
216 : * After we've removed the two a's and are considering the <2> node (which is
217 : * now just x <2> y), we have an ladd distance of 1 that needs to propagate
218 : * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
219 : * propagate to the rightmost <->, so that we'll end up with
220 : * w <2> ((x <2> y) <4> z)
221 : * Near the bottom of the tree, we may have subtrees consisting only of
222 : * stopwords. The distances of any phrase operators within such a subtree are
223 : * summed and propagated to both ladd and radd, since we don't know which side
224 : * of the lowest surviving phrase operator we are in. The rule is that any
225 : * subtree that degenerates to NULL must return equal values of ladd and radd,
226 : * and the parent node dealing with it should incorporate only one of those.
227 : *
228 : * Currently, we only implement this adjustment for adjacent phrase operators.
229 : * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
230 : * isn't ideal, but there is no way to represent the really desired semantics
231 : * without some redesign of the tsquery structure. Certainly it would not be
232 : * any better to convert that to 'x <2> (y | z)'. Since this is such a weird
233 : * corner case, let it go for now. But we can fix it in cases where the
234 : * intervening non-phrase operator also gets removed, for example
235 : * '((x <-> a) | a) <-> y' will become 'x <2> y'.
236 : */
237 : static NODE *
2302 tgl 238 GIC 1530 : clean_stopword_intree(NODE *node, int *ladd, int *radd)
5710 tgl 239 ECB : {
240 : /* since this function recurses, it could be driven to stack overflow. */
5693 teodor 241 GIC 1530 : check_stack_depth();
5693 teodor 242 ECB :
243 : /* default output parameters indicate no change in parent distance */
2302 tgl 244 GIC 1530 : *ladd = *radd = 0;
2558 teodor 245 ECB :
5693 teodor 246 GIC 1530 : if (node->valnode->type == QI_VAL)
5710 tgl 247 CBC 513 : return node;
5624 bruce 248 1017 : else if (node->valnode->type == QI_VALSTOP)
5710 tgl 249 ECB : {
5710 tgl 250 GIC 342 : pfree(node);
5710 tgl 251 CBC 342 : return NULL;
5710 tgl 252 ECB : }
253 :
5693 teodor 254 GIC 675 : Assert(node->valnode->type == QI_OPR);
5710 tgl 255 ECB :
5015 peter_e 256 GIC 675 : if (node->valnode->qoperator.oper == OP_NOT)
5710 tgl 257 ECB : {
258 : /* NOT doesn't change pattern width, so just report child distances */
2302 tgl 259 GIC 42 : node->right = clean_stopword_intree(node->right, ladd, radd);
5710 tgl 260 CBC 42 : if (!node->right)
5710 tgl 261 ECB : {
5710 tgl 262 GIC 9 : freetree(node);
5710 tgl 263 CBC 9 : return NULL;
5710 tgl 264 ECB : }
265 : }
266 : else
267 : {
2495 rhaas 268 GIC 633 : NODE *res = node;
2302 tgl 269 ECB : bool isphrase;
270 : int ndistance,
271 : lladd,
272 : lradd,
273 : rladd,
274 : rradd;
275 :
276 : /* First, recurse */
2302 tgl 277 GIC 633 : node->left = clean_stopword_intree(node->left, &lladd, &lradd);
2302 tgl 278 CBC 633 : node->right = clean_stopword_intree(node->right, &rladd, &rradd);
5710 tgl 279 ECB :
280 : /* Check if current node is OP_PHRASE, get its distance */
2302 tgl 281 GIC 633 : isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
2302 tgl 282 CBC 633 : ndistance = isphrase ? node->valnode->qoperator.distance : 0;
2558 teodor 283 ECB :
2302 tgl 284 GIC 633 : if (node->left == NULL && node->right == NULL)
5710 tgl 285 ECB : {
286 : /*
287 : * When we collapse out a phrase node entirely, propagate its own
288 : * distance into both *ladd and *radd; it is the responsibility of
289 : * the parent node to count it only once. Also, for a phrase
290 : * node, distances coming from children are summed and propagated
291 : * up to parent (we assume lladd == lradd and rladd == rradd, else
292 : * rule was broken at a lower level). But if this isn't a phrase
293 : * node, take the larger of the two child distances; that
294 : * corresponds to what TS_execute will do in non-stopword cases.
295 : */
2302 tgl 296 GIC 12 : if (isphrase)
2302 tgl 297 LBC 0 : *ladd = *radd = lladd + ndistance + rladd;
2302 tgl 298 EUB : else
2302 tgl 299 GIC 12 : *ladd = *radd = Max(lladd, rladd);
5710 tgl 300 CBC 12 : freetree(node);
301 12 : return NULL;
5710 tgl 302 ECB : }
2302 tgl 303 GIC 621 : else if (node->left == NULL)
5710 tgl 304 ECB : {
305 : /* Removing this operator and left subnode */
306 : /* lladd and lradd are equal/redundant, don't count both */
2302 tgl 307 GIC 126 : if (isphrase)
2302 tgl 308 ECB : {
309 : /* operator's own distance must propagate to left */
2302 tgl 310 GIC 81 : *ladd = lladd + ndistance + rladd;
2302 tgl 311 CBC 81 : *radd = rradd;
2302 tgl 312 ECB : }
313 : else
314 : {
315 : /* at non-phrase op, just forget the left subnode entirely */
2302 tgl 316 GIC 45 : *ladd = rladd;
2302 tgl 317 CBC 45 : *radd = rradd;
2302 tgl 318 ECB : }
5710 tgl 319 GIC 126 : res = node->right;
5710 tgl 320 CBC 126 : pfree(node);
5710 tgl 321 ECB : }
2302 tgl 322 GIC 495 : else if (node->right == NULL)
5710 tgl 323 ECB : {
324 : /* Removing this operator and right subnode */
325 : /* rladd and rradd are equal/redundant, don't count both */
2302 tgl 326 GIC 192 : if (isphrase)
2302 tgl 327 ECB : {
328 : /* operator's own distance must propagate to right */
2302 tgl 329 GIC 108 : *ladd = lladd;
2302 tgl 330 CBC 108 : *radd = lradd + ndistance + rradd;
2302 tgl 331 ECB : }
332 : else
333 : {
334 : /* at non-phrase op, just forget the right subnode entirely */
2302 tgl 335 GIC 84 : *ladd = lladd;
2302 tgl 336 CBC 84 : *radd = lradd;
2302 tgl 337 ECB : }
5710 tgl 338 GIC 192 : res = node->left;
5710 tgl 339 CBC 192 : pfree(node);
5710 tgl 340 ECB : }
2302 tgl 341 GIC 303 : else if (isphrase)
2558 teodor 342 ECB : {
343 : /* Absorb appropriate corrections at this level */
2302 tgl 344 GIC 171 : node->valnode->qoperator.distance += lradd + rladd;
2302 tgl 345 ECB : /* Propagate up any unaccounted-for corrections */
2302 tgl 346 GIC 171 : *ladd = lladd;
2302 tgl 347 CBC 171 : *radd = rradd;
2558 teodor 348 ECB : }
349 : else
350 : {
351 : /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
352 : }
353 :
5710 tgl 354 GIC 621 : return res;
5710 tgl 355 ECB : }
5710 tgl 356 GIC 33 : return node;
5710 tgl 357 ECB : }
358 :
359 : /*
360 : * Number of elements in query tree
361 : */
362 : static int32
2558 teodor 363 GIC 849 : calcstrlen(NODE *node)
2558 teodor 364 ECB : {
2495 rhaas 365 GIC 849 : int32 size = 0;
2558 teodor 366 ECB :
2558 teodor 367 GIC 849 : if (node->valnode->type == QI_VAL)
2558 teodor 368 ECB : {
2558 teodor 369 GIC 513 : size = node->valnode->qoperand.length + 1;
2558 teodor 370 ECB : }
371 : else
372 : {
2558 teodor 373 GIC 336 : Assert(node->valnode->type == QI_OPR);
2558 teodor 374 ECB :
2558 teodor 375 GIC 336 : size = calcstrlen(node->right);
2558 teodor 376 CBC 336 : if (node->valnode->qoperator.oper != OP_NOT)
377 303 : size += calcstrlen(node->left);
2558 teodor 378 ECB : }
379 :
2558 teodor 380 GIC 849 : return size;
2558 teodor 381 ECB : }
382 :
383 : /*
384 : * Remove QI_VALSTOP (stopword) nodes from TSQuery.
385 : */
386 : TSQuery
103 tgl 387 GNC 222 : cleanup_tsquery_stopwords(TSQuery in, bool noisy)
2558 teodor 388 ECB : {
389 : int32 len,
390 : lenstr,
391 : commonlen,
392 : i;
393 : NODE *root;
394 : int ladd,
395 : radd;
396 : TSQuery out;
397 : QueryItem *items;
398 : char *operands;
399 :
2558 teodor 400 GIC 222 : if (in->size == 0)
2558 teodor 401 LBC 0 : return in;
2558 teodor 402 EUB :
403 : /* eliminate stop words */
2302 tgl 404 GIC 222 : root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
2302 tgl 405 CBC 222 : if (root == NULL)
5710 tgl 406 ECB : {
103 tgl 407 GNC 12 : if (noisy)
408 12 : ereport(NOTICE,
409 : (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
2558 teodor 410 CBC 12 : out = palloc(HDRSIZETQ);
2558 teodor 411 GIC 12 : out->size = 0;
2558 teodor 412 CBC 12 : SET_VARSIZE(out, HDRSIZETQ);
413 12 : return out;
2558 teodor 414 ECB : }
415 :
416 : /*
417 : * Build TSQuery from plain view
418 : */
419 :
2558 teodor 420 GIC 210 : lenstr = calcstrlen(root);
421 210 : items = plaintree(root, &len);
2558 teodor 422 CBC 210 : commonlen = COMPUTESIZE(len, lenstr);
2558 teodor 423 ECB :
2558 teodor 424 CBC 210 : out = palloc(commonlen);
2558 teodor 425 GIC 210 : SET_VARSIZE(out, commonlen);
2558 teodor 426 CBC 210 : out->size = len;
2558 teodor 427 ECB :
2558 teodor 428 CBC 210 : memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
429 :
430 210 : items = GETQUERY(out);
2558 teodor 431 GIC 210 : operands = GETOPERAND(out);
2558 teodor 432 CBC 1059 : for (i = 0; i < out->size; i++)
2558 teodor 433 ECB : {
2558 teodor 434 CBC 849 : QueryOperand *op = (QueryOperand *) &items[i];
435 :
436 849 : if (op->type != QI_VAL)
2558 teodor 437 GIC 336 : continue;
2558 teodor 438 ECB :
2558 teodor 439 CBC 513 : memcpy(operands, GETOPERAND(in) + op->distance, op->length);
2558 teodor 440 GIC 513 : operands[op->length] = '\0';
2558 teodor 441 CBC 513 : op->distance = operands - GETOPERAND(out);
442 513 : operands += op->length + 1;
5710 tgl 443 ECB : }
444 :
2558 teodor 445 GIC 210 : return out;
446 : }
|