TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsquery_cleanup.c
4 : * Cleanup query from NOT values and/or stopword
5 : * Utility functions to correct work.
6 : *
7 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/tsquery_cleanup.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "miscadmin.h"
19 : #include "tsearch/ts_utils.h"
20 : #include "varatt.h"
21 :
22 : typedef struct NODE
23 : {
24 : struct NODE *left;
25 : struct NODE *right;
26 : QueryItem *valnode;
27 : } NODE;
28 :
29 : /*
30 : * make query tree from plain view of query
31 : */
32 : static NODE *
33 GIC 1530 : maketree(QueryItem *in)
34 ECB : {
35 GIC 1530 : NODE *node = (NODE *) palloc(sizeof(NODE));
36 ECB :
37 : /* since this function recurses, it could be driven to stack overflow. */
38 GIC 1530 : check_stack_depth();
39 ECB :
40 GIC 1530 : node->valnode = in;
41 CBC 1530 : node->right = node->left = NULL;
42 1530 : if (in->type == QI_OPR)
43 ECB : {
44 GIC 675 : node->right = maketree(in + 1);
45 CBC 675 : if (in->qoperator.oper != OP_NOT)
46 633 : node->left = maketree(in + in->qoperator.left);
47 ECB : }
48 GIC 1530 : return node;
49 ECB : }
50 :
51 : /*
52 : * Internal state for plaintree and plainnode
53 : */
54 : typedef struct
55 : {
56 : QueryItem *ptr;
57 : int len; /* allocated size of ptr */
58 : int cur; /* number of elements in ptr */
59 : } PLAINTREE;
60 :
61 : static void
62 GIC 849 : plainnode(PLAINTREE *state, NODE *node)
63 ECB : {
64 : /* since this function recurses, it could be driven to stack overflow. */
65 GIC 849 : check_stack_depth();
66 ECB :
67 GIC 849 : if (state->cur == state->len)
68 ECB : {
69 UIC 0 : state->len *= 2;
70 UNC 0 : state->ptr = (QueryItem *) repalloc(state->ptr, state->len * sizeof(QueryItem));
71 EUB : }
72 GNC 849 : memcpy(&(state->ptr[state->cur]), node->valnode, sizeof(QueryItem));
73 CBC 849 : if (node->valnode->type == QI_VAL)
74 513 : state->cur++;
75 336 : else if (node->valnode->qoperator.oper == OP_NOT)
76 ECB : {
77 GIC 33 : state->ptr[state->cur].qoperator.left = 1;
78 CBC 33 : state->cur++;
79 33 : plainnode(state, node->right);
80 ECB : }
81 : else
82 : {
83 GIC 303 : int cur = state->cur;
84 ECB :
85 GIC 303 : state->cur++;
86 CBC 303 : plainnode(state, node->right);
87 303 : state->ptr[cur].qoperator.left = state->cur - cur;
88 303 : plainnode(state, node->left);
89 ECB : }
90 GIC 849 : pfree(node);
91 CBC 849 : }
92 ECB :
93 : /*
94 : * make plain view of tree from a NODE-tree representation
95 : */
96 : static QueryItem *
97 GIC 210 : plaintree(NODE *root, int *len)
98 ECB : {
99 : PLAINTREE pl;
100 :
101 GIC 210 : pl.cur = 0;
102 CBC 210 : pl.len = 16;
103 210 : if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
104 ECB : {
105 GIC 210 : pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
106 CBC 210 : plainnode(&pl, root);
107 ECB : }
108 : else
109 UIC 0 : pl.ptr = NULL;
110 GBC 210 : *len = pl.cur;
111 CBC 210 : return pl.ptr;
112 ECB : }
113 :
114 : static void
115 GIC 21 : freetree(NODE *node)
116 ECB : {
117 : /* since this function recurses, it could be driven to stack overflow. */
118 GIC 21 : check_stack_depth();
119 ECB :
120 GIC 21 : if (!node)
121 LBC 0 : return;
122 GBC 21 : if (node->left)
123 LBC 0 : freetree(node->left);
124 GBC 21 : if (node->right)
125 LBC 0 : freetree(node->right);
126 GBC 21 : pfree(node);
127 ECB : }
128 :
129 : /*
130 : * clean tree for ! operator.
131 : * It's useful for debug, but in
132 : * other case, such view is used with search in index.
133 : * Operator ! always return TRUE
134 : */
135 : static NODE *
136 UIC 0 : clean_NOT_intree(NODE *node)
137 EUB : {
138 : /* since this function recurses, it could be driven to stack overflow. */
139 UIC 0 : check_stack_depth();
140 EUB :
141 UIC 0 : if (node->valnode->type == QI_VAL)
142 UBC 0 : return node;
143 EUB :
144 UIC 0 : if (node->valnode->qoperator.oper == OP_NOT)
145 EUB : {
146 UIC 0 : freetree(node);
147 UBC 0 : return NULL;
148 EUB : }
149 :
150 : /* operator & or | */
151 UIC 0 : if (node->valnode->qoperator.oper == OP_OR)
152 EUB : {
153 UIC 0 : if ((node->left = clean_NOT_intree(node->left)) == NULL ||
154 UBC 0 : (node->right = clean_NOT_intree(node->right)) == NULL)
155 EUB : {
156 UIC 0 : freetree(node);
157 UBC 0 : return NULL;
158 EUB : }
159 : }
160 : else
161 : {
162 UIC 0 : NODE *res = node;
163 EUB :
164 UIC 0 : Assert(node->valnode->qoperator.oper == OP_AND ||
165 EUB : node->valnode->qoperator.oper == OP_PHRASE);
166 :
167 UIC 0 : node->left = clean_NOT_intree(node->left);
168 UBC 0 : node->right = clean_NOT_intree(node->right);
169 0 : if (node->left == NULL && node->right == NULL)
170 EUB : {
171 UIC 0 : pfree(node);
172 UBC 0 : res = NULL;
173 EUB : }
174 UIC 0 : else if (node->left == NULL)
175 EUB : {
176 UIC 0 : res = node->right;
177 UBC 0 : pfree(node);
178 EUB : }
179 UIC 0 : else if (node->right == NULL)
180 EUB : {
181 UIC 0 : res = node->left;
182 UBC 0 : pfree(node);
183 EUB : }
184 UIC 0 : return res;
185 EUB : }
186 UIC 0 : return node;
187 EUB : }
188 :
189 : QueryItem *
190 UIC 0 : clean_NOT(QueryItem *ptr, int *len)
191 EUB : {
192 UIC 0 : NODE *root = maketree(ptr);
193 EUB :
194 UIC 0 : return plaintree(clean_NOT_intree(root), len);
195 EUB : }
196 :
197 :
198 : /*
199 : * Remove QI_VALSTOP (stopword) nodes from query tree.
200 : *
201 : * Returns NULL if the query degenerates to nothing. Input must not be NULL.
202 : *
203 : * When we remove a phrase operator due to removing one or both of its
204 : * arguments, we might need to adjust the distance of a parent phrase
205 : * operator. For example, 'a' is a stopword, so:
206 : * (b <-> a) <-> c should become b <2> c
207 : * b <-> (a <-> c) should become b <2> c
208 : * (b <-> (a <-> a)) <-> c should become b <3> c
209 : * b <-> ((a <-> a) <-> c) should become b <3> c
210 : * To handle that, we define two output parameters:
211 : * ladd: amount to add to a phrase distance to the left of this node
212 : * radd: amount to add to a phrase distance to the right of this node
213 : * We need two outputs because we could need to bubble up adjustments to two
214 : * different parent phrase operators. Consider
215 : * w <-> (((a <-> x) <2> (y <3> a)) <-> z)
216 : * After we've removed the two a's and are considering the <2> node (which is
217 : * now just x <2> y), we have an ladd distance of 1 that needs to propagate
218 : * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
219 : * propagate to the rightmost <->, so that we'll end up with
220 : * w <2> ((x <2> y) <4> z)
221 : * Near the bottom of the tree, we may have subtrees consisting only of
222 : * stopwords. The distances of any phrase operators within such a subtree are
223 : * summed and propagated to both ladd and radd, since we don't know which side
224 : * of the lowest surviving phrase operator we are in. The rule is that any
225 : * subtree that degenerates to NULL must return equal values of ladd and radd,
226 : * and the parent node dealing with it should incorporate only one of those.
227 : *
228 : * Currently, we only implement this adjustment for adjacent phrase operators.
229 : * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
230 : * isn't ideal, but there is no way to represent the really desired semantics
231 : * without some redesign of the tsquery structure. Certainly it would not be
232 : * any better to convert that to 'x <2> (y | z)'. Since this is such a weird
233 : * corner case, let it go for now. But we can fix it in cases where the
234 : * intervening non-phrase operator also gets removed, for example
235 : * '((x <-> a) | a) <-> y' will become 'x <2> y'.
236 : */
237 : static NODE *
238 GIC 1530 : clean_stopword_intree(NODE *node, int *ladd, int *radd)
239 ECB : {
240 : /* since this function recurses, it could be driven to stack overflow. */
241 GIC 1530 : check_stack_depth();
242 ECB :
243 : /* default output parameters indicate no change in parent distance */
244 GIC 1530 : *ladd = *radd = 0;
245 ECB :
246 GIC 1530 : if (node->valnode->type == QI_VAL)
247 CBC 513 : return node;
248 1017 : else if (node->valnode->type == QI_VALSTOP)
249 ECB : {
250 GIC 342 : pfree(node);
251 CBC 342 : return NULL;
252 ECB : }
253 :
254 GIC 675 : Assert(node->valnode->type == QI_OPR);
255 ECB :
256 GIC 675 : if (node->valnode->qoperator.oper == OP_NOT)
257 ECB : {
258 : /* NOT doesn't change pattern width, so just report child distances */
259 GIC 42 : node->right = clean_stopword_intree(node->right, ladd, radd);
260 CBC 42 : if (!node->right)
261 ECB : {
262 GIC 9 : freetree(node);
263 CBC 9 : return NULL;
264 ECB : }
265 : }
266 : else
267 : {
268 GIC 633 : NODE *res = node;
269 ECB : bool isphrase;
270 : int ndistance,
271 : lladd,
272 : lradd,
273 : rladd,
274 : rradd;
275 :
276 : /* First, recurse */
277 GIC 633 : node->left = clean_stopword_intree(node->left, &lladd, &lradd);
278 CBC 633 : node->right = clean_stopword_intree(node->right, &rladd, &rradd);
279 ECB :
280 : /* Check if current node is OP_PHRASE, get its distance */
281 GIC 633 : isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
282 CBC 633 : ndistance = isphrase ? node->valnode->qoperator.distance : 0;
283 ECB :
284 GIC 633 : if (node->left == NULL && node->right == NULL)
285 ECB : {
286 : /*
287 : * When we collapse out a phrase node entirely, propagate its own
288 : * distance into both *ladd and *radd; it is the responsibility of
289 : * the parent node to count it only once. Also, for a phrase
290 : * node, distances coming from children are summed and propagated
291 : * up to parent (we assume lladd == lradd and rladd == rradd, else
292 : * rule was broken at a lower level). But if this isn't a phrase
293 : * node, take the larger of the two child distances; that
294 : * corresponds to what TS_execute will do in non-stopword cases.
295 : */
296 GIC 12 : if (isphrase)
297 LBC 0 : *ladd = *radd = lladd + ndistance + rladd;
298 EUB : else
299 GIC 12 : *ladd = *radd = Max(lladd, rladd);
300 CBC 12 : freetree(node);
301 12 : return NULL;
302 ECB : }
303 GIC 621 : else if (node->left == NULL)
304 ECB : {
305 : /* Removing this operator and left subnode */
306 : /* lladd and lradd are equal/redundant, don't count both */
307 GIC 126 : if (isphrase)
308 ECB : {
309 : /* operator's own distance must propagate to left */
310 GIC 81 : *ladd = lladd + ndistance + rladd;
311 CBC 81 : *radd = rradd;
312 ECB : }
313 : else
314 : {
315 : /* at non-phrase op, just forget the left subnode entirely */
316 GIC 45 : *ladd = rladd;
317 CBC 45 : *radd = rradd;
318 ECB : }
319 GIC 126 : res = node->right;
320 CBC 126 : pfree(node);
321 ECB : }
322 GIC 495 : else if (node->right == NULL)
323 ECB : {
324 : /* Removing this operator and right subnode */
325 : /* rladd and rradd are equal/redundant, don't count both */
326 GIC 192 : if (isphrase)
327 ECB : {
328 : /* operator's own distance must propagate to right */
329 GIC 108 : *ladd = lladd;
330 CBC 108 : *radd = lradd + ndistance + rradd;
331 ECB : }
332 : else
333 : {
334 : /* at non-phrase op, just forget the right subnode entirely */
335 GIC 84 : *ladd = lladd;
336 CBC 84 : *radd = lradd;
337 ECB : }
338 GIC 192 : res = node->left;
339 CBC 192 : pfree(node);
340 ECB : }
341 GIC 303 : else if (isphrase)
342 ECB : {
343 : /* Absorb appropriate corrections at this level */
344 GIC 171 : node->valnode->qoperator.distance += lradd + rladd;
345 ECB : /* Propagate up any unaccounted-for corrections */
346 GIC 171 : *ladd = lladd;
347 CBC 171 : *radd = rradd;
348 ECB : }
349 : else
350 : {
351 : /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
352 : }
353 :
354 GIC 621 : return res;
355 ECB : }
356 GIC 33 : return node;
357 ECB : }
358 :
359 : /*
360 : * Number of elements in query tree
361 : */
362 : static int32
363 GIC 849 : calcstrlen(NODE *node)
364 ECB : {
365 GIC 849 : int32 size = 0;
366 ECB :
367 GIC 849 : if (node->valnode->type == QI_VAL)
368 ECB : {
369 GIC 513 : size = node->valnode->qoperand.length + 1;
370 ECB : }
371 : else
372 : {
373 GIC 336 : Assert(node->valnode->type == QI_OPR);
374 ECB :
375 GIC 336 : size = calcstrlen(node->right);
376 CBC 336 : if (node->valnode->qoperator.oper != OP_NOT)
377 303 : size += calcstrlen(node->left);
378 ECB : }
379 :
380 GIC 849 : return size;
381 ECB : }
382 :
383 : /*
384 : * Remove QI_VALSTOP (stopword) nodes from TSQuery.
385 : */
386 : TSQuery
387 GNC 222 : cleanup_tsquery_stopwords(TSQuery in, bool noisy)
388 ECB : {
389 : int32 len,
390 : lenstr,
391 : commonlen,
392 : i;
393 : NODE *root;
394 : int ladd,
395 : radd;
396 : TSQuery out;
397 : QueryItem *items;
398 : char *operands;
399 :
400 GIC 222 : if (in->size == 0)
401 LBC 0 : return in;
402 EUB :
403 : /* eliminate stop words */
404 GIC 222 : root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
405 CBC 222 : if (root == NULL)
406 ECB : {
407 GNC 12 : if (noisy)
408 12 : ereport(NOTICE,
409 : (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
410 CBC 12 : out = palloc(HDRSIZETQ);
411 GIC 12 : out->size = 0;
412 CBC 12 : SET_VARSIZE(out, HDRSIZETQ);
413 12 : return out;
414 ECB : }
415 :
416 : /*
417 : * Build TSQuery from plain view
418 : */
419 :
420 GIC 210 : lenstr = calcstrlen(root);
421 210 : items = plaintree(root, &len);
422 CBC 210 : commonlen = COMPUTESIZE(len, lenstr);
423 ECB :
424 CBC 210 : out = palloc(commonlen);
425 GIC 210 : SET_VARSIZE(out, commonlen);
426 CBC 210 : out->size = len;
427 ECB :
428 CBC 210 : memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
429 :
430 210 : items = GETQUERY(out);
431 GIC 210 : operands = GETOPERAND(out);
432 CBC 1059 : for (i = 0; i < out->size; i++)
433 ECB : {
434 CBC 849 : QueryOperand *op = (QueryOperand *) &items[i];
435 :
436 849 : if (op->type != QI_VAL)
437 GIC 336 : continue;
438 ECB :
439 CBC 513 : memcpy(operands, GETOPERAND(in) + op->distance, op->length);
440 GIC 513 : operands[op->length] = '\0';
441 CBC 513 : op->distance = operands - GETOPERAND(out);
442 513 : operands += op->length + 1;
443 ECB : }
444 :
445 GIC 210 : return out;
446 : }
|