Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * like_support.c
4 : * Planner support functions for LIKE, regex, and related operators.
5 : *
6 : * These routines handle special optimization of operators that can be
7 : * used with index scans even though they are not known to the executor's
8 : * indexscan machinery. The key idea is that these operators allow us
9 : * to derive approximate indexscan qual clauses, such that any tuples
10 : * that pass the operator clause itself must also satisfy the simpler
11 : * indexscan condition(s). Then we can use the indexscan machinery
12 : * to avoid scanning as much of the table as we'd otherwise have to,
13 : * while applying the original operator as a qpqual condition to ensure
14 : * we deliver only the tuples we want. (In essence, we're using a regular
15 : * index as if it were a lossy index.)
16 : *
17 : * An example of what we're doing is
18 : * textfield LIKE 'abc%def'
19 : * from which we can generate the indexscanable conditions
20 : * textfield >= 'abc' AND textfield < 'abd'
21 : * which allow efficient scanning of an index on textfield.
22 : * (In reality, character set and collation issues make the transformation
23 : * from LIKE to indexscan limits rather harder than one might think ...
24 : * but that's the basic idea.)
25 : *
26 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : *
30 : * IDENTIFICATION
31 : * src/backend/utils/adt/like_support.c
32 : *
33 : *-------------------------------------------------------------------------
34 : */
35 : #include "postgres.h"
36 :
37 : #include <math.h>
38 :
39 : #include "access/htup_details.h"
40 : #include "access/stratnum.h"
41 : #include "catalog/pg_collation.h"
42 : #include "catalog/pg_operator.h"
43 : #include "catalog/pg_opfamily.h"
44 : #include "catalog/pg_statistic.h"
45 : #include "catalog/pg_type.h"
46 : #include "mb/pg_wchar.h"
47 : #include "miscadmin.h"
48 : #include "nodes/makefuncs.h"
49 : #include "nodes/nodeFuncs.h"
50 : #include "nodes/supportnodes.h"
51 : #include "utils/builtins.h"
52 : #include "utils/datum.h"
53 : #include "utils/lsyscache.h"
54 : #include "utils/pg_locale.h"
55 : #include "utils/selfuncs.h"
56 : #include "utils/varlena.h"
57 :
58 :
59 : typedef enum
60 : {
61 : Pattern_Type_Like,
62 : Pattern_Type_Like_IC,
63 : Pattern_Type_Regex,
64 : Pattern_Type_Regex_IC,
65 : Pattern_Type_Prefix
66 : } Pattern_Type;
67 :
68 : typedef enum
69 : {
70 : Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact
71 : } Pattern_Prefix_Status;
72 :
73 : static Node *like_regex_support(Node *rawreq, Pattern_Type ptype);
74 : static List *match_pattern_prefix(Node *leftop,
75 : Node *rightop,
76 : Pattern_Type ptype,
77 : Oid expr_coll,
78 : Oid opfamily,
79 : Oid indexcollation);
80 : static double patternsel_common(PlannerInfo *root,
81 : Oid oprid,
82 : Oid opfuncid,
83 : List *args,
84 : int varRelid,
85 : Oid collation,
86 : Pattern_Type ptype,
87 : bool negate);
88 : static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
89 : Pattern_Type ptype,
90 : Oid collation,
91 : Const **prefix,
92 : Selectivity *rest_selec);
93 : static Selectivity prefix_selectivity(PlannerInfo *root,
94 : VariableStatData *vardata,
95 : Oid eqopr, Oid ltopr, Oid geopr,
96 : Oid collation,
97 : Const *prefixcon);
98 : static Selectivity like_selectivity(const char *patt, int pattlen,
99 : bool case_insensitive);
100 : static Selectivity regex_selectivity(const char *patt, int pattlen,
101 : bool case_insensitive,
102 : int fixed_prefix_len);
103 : static int pattern_char_isalpha(char c, bool is_multibyte,
104 : pg_locale_t locale, bool locale_is_c);
105 : static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
106 : Oid collation);
107 : static Datum string_to_datum(const char *str, Oid datatype);
108 : static Const *string_to_const(const char *str, Oid datatype);
109 : static Const *string_to_bytea_const(const char *str, size_t str_len);
110 :
111 :
112 : /*
113 : * Planner support functions for LIKE, regex, and related operators
114 : */
115 : Datum
1518 tgl 116 CBC 2956 : textlike_support(PG_FUNCTION_ARGS)
117 : {
118 2956 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
119 :
120 2956 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like));
121 : }
122 :
123 : Datum
124 206 : texticlike_support(PG_FUNCTION_ARGS)
125 : {
126 206 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
127 :
128 206 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC));
129 : }
130 :
131 : Datum
132 9670 : textregexeq_support(PG_FUNCTION_ARGS)
133 : {
134 9670 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
135 :
136 9670 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex));
137 : }
138 :
139 : Datum
140 86 : texticregexeq_support(PG_FUNCTION_ARGS)
141 : {
142 86 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
143 :
144 86 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC));
145 : }
146 :
147 : Datum
508 148 78 : text_starts_with_support(PG_FUNCTION_ARGS)
149 : {
150 78 : Node *rawreq = (Node *) PG_GETARG_POINTER(0);
151 :
152 78 : PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Prefix));
153 : }
154 :
155 : /* Common code for the above */
156 : static Node *
1518 157 12996 : like_regex_support(Node *rawreq, Pattern_Type ptype)
158 : {
159 12996 : Node *ret = NULL;
160 :
1515 161 12996 : if (IsA(rawreq, SupportRequestSelectivity))
162 : {
163 : /*
164 : * Make a selectivity estimate for a function call, just as we'd do if
165 : * the call was via the corresponding operator.
166 : */
167 12 : SupportRequestSelectivity *req = (SupportRequestSelectivity *) rawreq;
168 : Selectivity s1;
169 :
170 12 : if (req->is_join)
171 : {
172 : /*
173 : * For the moment we just punt. If patternjoinsel is ever
174 : * improved to do better, this should be made to call it.
175 : */
1515 tgl 176 UBC 0 : s1 = DEFAULT_MATCH_SEL;
177 : }
178 : else
179 : {
180 : /* Share code with operator restriction selectivity functions */
1515 tgl 181 CBC 12 : s1 = patternsel_common(req->root,
182 : InvalidOid,
183 : req->funcid,
184 : req->args,
185 : req->varRelid,
186 : req->inputcollid,
187 : ptype,
188 : false);
189 : }
190 12 : req->selectivity = s1;
191 12 : ret = (Node *) req;
192 : }
193 12984 : else if (IsA(rawreq, SupportRequestIndexCondition))
194 : {
195 : /* Try to convert operator/function call to index conditions */
1518 196 3534 : SupportRequestIndexCondition *req = (SupportRequestIndexCondition *) rawreq;
197 :
198 : /*
199 : * Currently we have no "reverse" match operators with the pattern on
200 : * the left, so we only need consider cases with the indexkey on the
201 : * left.
202 : */
203 3534 : if (req->indexarg != 0)
1518 tgl 204 UBC 0 : return NULL;
205 :
1518 tgl 206 CBC 3534 : if (is_opclause(req->node))
207 : {
208 3522 : OpExpr *clause = (OpExpr *) req->node;
209 :
210 3522 : Assert(list_length(clause->args) == 2);
211 : ret = (Node *)
212 3522 : match_pattern_prefix((Node *) linitial(clause->args),
213 3522 : (Node *) lsecond(clause->args),
214 : ptype,
215 : clause->inputcollid,
216 : req->opfamily,
217 : req->indexcollation);
218 : }
219 12 : else if (is_funcclause(req->node)) /* be paranoid */
220 : {
221 12 : FuncExpr *clause = (FuncExpr *) req->node;
222 :
223 12 : Assert(list_length(clause->args) == 2);
224 : ret = (Node *)
225 12 : match_pattern_prefix((Node *) linitial(clause->args),
226 12 : (Node *) lsecond(clause->args),
227 : ptype,
228 : clause->inputcollid,
229 : req->opfamily,
230 : req->indexcollation);
231 : }
232 : }
233 :
234 12996 : return ret;
235 : }
236 :
237 : /*
238 : * match_pattern_prefix
239 : * Try to generate an indexqual for a LIKE or regex operator.
240 : */
241 : static List *
242 3534 : match_pattern_prefix(Node *leftop,
243 : Node *rightop,
244 : Pattern_Type ptype,
245 : Oid expr_coll,
246 : Oid opfamily,
247 : Oid indexcollation)
248 : {
249 : List *result;
250 : Const *patt;
251 : Const *prefix;
252 : Pattern_Prefix_Status pstatus;
253 : Oid ldatatype;
254 : Oid rdatatype;
255 : Oid eqopr;
256 : Oid ltopr;
257 : Oid geopr;
508 258 3534 : Oid preopr = InvalidOid;
259 : bool collation_aware;
260 : Expr *expr;
261 : FmgrInfo ltproc;
262 : Const *greaterstr;
263 :
264 : /*
265 : * Can't do anything with a non-constant or NULL pattern argument.
266 : *
267 : * Note that since we restrict ourselves to cases with a hard constant on
268 : * the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
269 : * about verifying that.
270 : */
1518 271 3534 : if (!IsA(rightop, Const) ||
272 3484 : ((Const *) rightop)->constisnull)
273 50 : return NIL;
274 3484 : patt = (Const *) rightop;
275 :
276 : /*
277 : * Not supported if the expression collation is nondeterministic. The
278 : * optimized equality or prefix tests use bytewise comparisons, which is
279 : * not consistent with nondeterministic collations. The actual
280 : * pattern-matching implementation functions will later error out that
281 : * pattern-matching is not supported with nondeterministic collations. (We
282 : * could also error out here, but by doing it later we get more precise
283 : * error messages.) (It should be possible to support at least
284 : * Pattern_Prefix_Exact, but no point as long as the actual
285 : * pattern-matching implementations don't support it.)
286 : *
287 : * expr_coll is not set for a non-collation-aware data type such as bytea.
288 : */
1455 peter 289 3484 : if (expr_coll && !get_collation_isdeterministic(expr_coll))
1479 290 6 : return NIL;
291 :
292 : /*
293 : * Try to extract a fixed prefix from the pattern.
294 : */
1518 tgl 295 3478 : pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
296 : &prefix, NULL);
297 :
298 : /* fail if no fixed prefix */
299 3478 : if (pstatus == Pattern_Prefix_None)
300 155 : return NIL;
301 :
302 : /*
303 : * Identify the operators we want to use, based on the type of the
304 : * left-hand argument. Usually these are just the type's regular
305 : * comparison operators, but if we are considering one of the semi-legacy
306 : * "pattern" opclasses, use the "pattern" operators instead. Those are
307 : * not collation-sensitive but always use C collation, as we want. The
308 : * selected operators also determine the needed type of the prefix
309 : * constant.
310 : */
1236 311 3323 : ldatatype = exprType(leftop);
312 3323 : switch (ldatatype)
313 : {
314 34 : case TEXTOID:
508 315 34 : if (opfamily == TEXT_PATTERN_BTREE_FAM_OID)
316 : {
508 tgl 317 UBC 0 : eqopr = TextEqualOperator;
318 0 : ltopr = TextPatternLessOperator;
319 0 : geopr = TextPatternGreaterEqualOperator;
320 0 : collation_aware = false;
321 : }
508 tgl 322 CBC 34 : else if (opfamily == TEXT_SPGIST_FAM_OID)
323 : {
1236 324 12 : eqopr = TextEqualOperator;
325 12 : ltopr = TextPatternLessOperator;
326 12 : geopr = TextPatternGreaterEqualOperator;
327 : /* This opfamily has direct support for prefixing */
508 328 12 : preopr = TextPrefixOperator;
1236 329 12 : collation_aware = false;
330 : }
331 : else
332 : {
333 22 : eqopr = TextEqualOperator;
334 22 : ltopr = TextLessOperator;
335 22 : geopr = TextGreaterEqualOperator;
336 22 : collation_aware = true;
337 : }
1518 338 34 : rdatatype = TEXTOID;
339 34 : break;
1236 340 3277 : case NAMEOID:
341 :
342 : /*
343 : * Note that here, we need the RHS type to be text, so that the
344 : * comparison value isn't improperly truncated to NAMEDATALEN.
345 : */
346 3277 : eqopr = NameEqualTextOperator;
347 3277 : ltopr = NameLessTextOperator;
348 3277 : geopr = NameGreaterEqualTextOperator;
349 3277 : collation_aware = true;
1518 350 3277 : rdatatype = TEXTOID;
351 3277 : break;
1236 352 12 : case BPCHAROID:
353 12 : if (opfamily == BPCHAR_PATTERN_BTREE_FAM_OID)
354 : {
1236 tgl 355 UBC 0 : eqopr = BpcharEqualOperator;
356 0 : ltopr = BpcharPatternLessOperator;
357 0 : geopr = BpcharPatternGreaterEqualOperator;
358 0 : collation_aware = false;
359 : }
360 : else
361 : {
1236 tgl 362 CBC 12 : eqopr = BpcharEqualOperator;
363 12 : ltopr = BpcharLessOperator;
364 12 : geopr = BpcharGreaterEqualOperator;
365 12 : collation_aware = true;
366 : }
1518 367 12 : rdatatype = BPCHAROID;
368 12 : break;
1236 tgl 369 UBC 0 : case BYTEAOID:
370 0 : eqopr = ByteaEqualOperator;
371 0 : ltopr = ByteaLessOperator;
372 0 : geopr = ByteaGreaterEqualOperator;
373 0 : collation_aware = false;
1518 374 0 : rdatatype = BYTEAOID;
375 0 : break;
376 0 : default:
377 : /* Can't get here unless we're attached to the wrong operator */
378 0 : return NIL;
379 : }
380 :
381 : /*
382 : * If necessary, coerce the prefix constant to the right type. The given
383 : * prefix constant is either text or bytea type, therefore the only case
384 : * where we need to do anything is when converting text to bpchar. Those
385 : * two types are binary-compatible, so relabeling the Const node is
386 : * sufficient.
387 : */
1518 tgl 388 CBC 3323 : if (prefix->consttype != rdatatype)
389 : {
390 12 : Assert(prefix->consttype == TEXTOID &&
391 : rdatatype == BPCHAROID);
392 12 : prefix->consttype = rdatatype;
393 : }
394 :
395 : /*
396 : * If we found an exact-match pattern, generate an "=" indexqual.
397 : *
398 : * Here and below, check to see whether the desired operator is actually
399 : * supported by the index opclass, and fail quietly if not. This allows
400 : * us to not be concerned with specific opclasses (except for the legacy
401 : * "pattern" cases); any index that correctly implements the operators
402 : * will work.
403 : */
404 3323 : if (pstatus == Pattern_Prefix_Exact)
405 : {
1236 406 2710 : if (!op_in_opfamily(eqopr, opfamily))
1237 407 6 : return NIL;
1236 408 2704 : expr = make_opclause(eqopr, BOOLOID, false,
409 : (Expr *) leftop, (Expr *) prefix,
410 : InvalidOid, indexcollation);
1518 411 2704 : result = list_make1(expr);
412 2704 : return result;
413 : }
414 :
415 : /*
416 : * Otherwise, we have a nonempty required prefix of the values. Some
417 : * opclasses support prefix checks directly, otherwise we'll try to
418 : * generate a range constraint.
419 : */
508 420 613 : if (OidIsValid(preopr) && op_in_opfamily(preopr, opfamily))
421 : {
422 12 : expr = make_opclause(preopr, BOOLOID, false,
423 : (Expr *) leftop, (Expr *) prefix,
424 : InvalidOid, indexcollation);
425 12 : result = list_make1(expr);
426 12 : return result;
427 : }
428 :
429 : /*
430 : * Since we need a range constraint, it's only going to work reliably if
431 : * the index is collation-insensitive or has "C" collation. Note that
432 : * here we are looking at the index's collation, not the expression's
433 : * collation -- this test is *not* dependent on the LIKE/regex operator's
434 : * collation.
435 : */
436 601 : if (collation_aware &&
437 601 : !lc_collate_is_c(indexcollation))
438 4 : return NIL;
439 :
440 : /*
441 : * We can always say "x >= prefix".
442 : */
1236 443 597 : if (!op_in_opfamily(geopr, opfamily))
1237 444 6 : return NIL;
1236 445 591 : expr = make_opclause(geopr, BOOLOID, false,
446 : (Expr *) leftop, (Expr *) prefix,
447 : InvalidOid, indexcollation);
1518 448 591 : result = list_make1(expr);
449 :
450 : /*-------
451 : * If we can create a string larger than the prefix, we can say
452 : * "x < greaterstr". NB: we rely on make_greater_string() to generate
453 : * a guaranteed-greater string, not just a probably-greater string.
454 : * In general this is only guaranteed in C locale, so we'd better be
455 : * using a C-locale index collation.
456 : *-------
457 : */
1236 458 591 : if (!op_in_opfamily(ltopr, opfamily))
1237 tgl 459 UBC 0 : return result;
1236 tgl 460 CBC 591 : fmgr_info(get_opcode(ltopr), <proc);
1518 461 591 : greaterstr = make_greater_string(prefix, <proc, indexcollation);
462 591 : if (greaterstr)
463 : {
1236 464 591 : expr = make_opclause(ltopr, BOOLOID, false,
465 : (Expr *) leftop, (Expr *) greaterstr,
466 : InvalidOid, indexcollation);
1518 467 591 : result = lappend(result, expr);
468 : }
469 :
470 591 : return result;
471 : }
472 :
473 :
474 : /*
475 : * patternsel_common - generic code for pattern-match restriction selectivity.
476 : *
477 : * To support using this from either the operator or function paths, caller
478 : * may pass either operator OID or underlying function OID; we look up the
479 : * latter from the former if needed. (We could just have patternsel() call
480 : * get_opcode(), but the work would be wasted if we don't have a need to
481 : * compare a fixed prefix to the pg_statistic data.)
482 : *
483 : * Note that oprid and/or opfuncid should be for the positive-match operator
484 : * even when negate is true.
485 : */
486 : static double
1515 487 4847 : patternsel_common(PlannerInfo *root,
488 : Oid oprid,
489 : Oid opfuncid,
490 : List *args,
491 : int varRelid,
492 : Oid collation,
493 : Pattern_Type ptype,
494 : bool negate)
495 : {
496 : VariableStatData vardata;
497 : Node *other;
498 : bool varonleft;
499 : Datum constval;
500 : Oid consttype;
501 : Oid vartype;
502 : Oid rdatatype;
503 : Oid eqopr;
504 : Oid ltopr;
505 : Oid geopr;
506 : Pattern_Prefix_Status pstatus;
507 : Const *patt;
508 4847 : Const *prefix = NULL;
509 4847 : Selectivity rest_selec = 0;
510 4847 : double nullfrac = 0.0;
511 : double result;
512 :
513 : /*
514 : * Initialize result to the appropriate default estimate depending on
515 : * whether it's a match or not-match operator.
516 : */
517 4847 : if (negate)
518 469 : result = 1.0 - DEFAULT_MATCH_SEL;
519 : else
520 4378 : result = DEFAULT_MATCH_SEL;
521 :
522 : /*
523 : * If expression is not variable op constant, then punt and return the
524 : * default estimate.
525 : */
526 4847 : if (!get_restriction_variable(root, args, varRelid,
527 : &vardata, &other, &varonleft))
528 68 : return result;
529 4779 : if (!varonleft || !IsA(other, Const))
530 : {
531 20 : ReleaseVariableStats(vardata);
532 20 : return result;
533 : }
534 :
535 : /*
536 : * If the constant is NULL, assume operator is strict and return zero, ie,
537 : * operator will never return TRUE. (It's zero even for a negator op.)
538 : */
539 4759 : if (((Const *) other)->constisnull)
540 : {
1515 tgl 541 UBC 0 : ReleaseVariableStats(vardata);
542 0 : return 0.0;
543 : }
1515 tgl 544 CBC 4759 : constval = ((Const *) other)->constvalue;
545 4759 : consttype = ((Const *) other)->consttype;
546 :
547 : /*
548 : * The right-hand const is type text or bytea for all supported operators.
549 : * We do not expect to see binary-compatible types here, since
550 : * const-folding should have relabeled the const to exactly match the
551 : * operator's declared type.
552 : */
553 4759 : if (consttype != TEXTOID && consttype != BYTEAOID)
554 : {
555 12 : ReleaseVariableStats(vardata);
556 12 : return result;
557 : }
558 :
559 : /*
560 : * Similarly, the exposed type of the left-hand side should be one of
561 : * those we know. (Do not look at vardata.atttype, which might be
562 : * something binary-compatible but different.) We can use it to identify
563 : * the comparison operators and the required type of the comparison
564 : * constant, much as in match_pattern_prefix().
565 : */
566 4747 : vartype = vardata.vartype;
567 :
568 4747 : switch (vartype)
569 : {
570 953 : case TEXTOID:
1236 571 953 : eqopr = TextEqualOperator;
572 953 : ltopr = TextLessOperator;
573 953 : geopr = TextGreaterEqualOperator;
574 953 : rdatatype = TEXTOID;
575 953 : break;
1515 576 3747 : case NAMEOID:
577 :
578 : /*
579 : * Note that here, we need the RHS type to be text, so that the
580 : * comparison value isn't improperly truncated to NAMEDATALEN.
581 : */
1236 582 3747 : eqopr = NameEqualTextOperator;
583 3747 : ltopr = NameLessTextOperator;
584 3747 : geopr = NameGreaterEqualTextOperator;
585 3747 : rdatatype = TEXTOID;
1515 586 3747 : break;
587 42 : case BPCHAROID:
1236 588 42 : eqopr = BpcharEqualOperator;
589 42 : ltopr = BpcharLessOperator;
590 42 : geopr = BpcharGreaterEqualOperator;
591 42 : rdatatype = BPCHAROID;
1515 592 42 : break;
593 3 : case BYTEAOID:
1236 594 3 : eqopr = ByteaEqualOperator;
595 3 : ltopr = ByteaLessOperator;
596 3 : geopr = ByteaGreaterEqualOperator;
597 3 : rdatatype = BYTEAOID;
1515 598 3 : break;
599 2 : default:
600 : /* Can't get here unless we're attached to the wrong operator */
601 2 : ReleaseVariableStats(vardata);
602 2 : return result;
603 : }
604 :
605 : /*
606 : * Grab the nullfrac for use below.
607 : */
608 4745 : if (HeapTupleIsValid(vardata.statsTuple))
609 : {
610 : Form_pg_statistic stats;
611 :
612 3576 : stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
613 3576 : nullfrac = stats->stanullfrac;
614 : }
615 :
616 : /*
617 : * Pull out any fixed prefix implied by the pattern, and estimate the
618 : * fractional selectivity of the remainder of the pattern. Unlike many
619 : * other selectivity estimators, we use the pattern operator's actual
620 : * collation for this step. This is not because we expect the collation
621 : * to make a big difference in the selectivity estimate (it seldom would),
622 : * but because we want to be sure we cache compiled regexps under the
623 : * right cache key, so that they can be re-used at runtime.
624 : */
625 4745 : patt = (Const *) other;
626 4745 : pstatus = pattern_fixed_prefix(patt, ptype, collation,
627 : &prefix, &rest_selec);
628 :
629 : /*
630 : * If necessary, coerce the prefix constant to the right type. The only
631 : * case where we need to do anything is when converting text to bpchar.
632 : * Those two types are binary-compatible, so relabeling the Const node is
633 : * sufficient.
634 : */
1236 635 4733 : if (prefix && prefix->consttype != rdatatype)
636 : {
637 18 : Assert(prefix->consttype == TEXTOID &&
638 : rdatatype == BPCHAROID);
639 18 : prefix->consttype = rdatatype;
640 : }
641 :
1515 642 4733 : if (pstatus == Pattern_Prefix_Exact)
643 : {
644 : /*
645 : * Pattern specifies an exact match, so estimate as for '='
646 : */
1038 647 2796 : result = var_eq_const(&vardata, eqopr, collation, prefix->constvalue,
648 : false, true, false);
649 : }
650 : else
651 : {
652 : /*
653 : * Not exact-match pattern. If we have a sufficiently large
654 : * histogram, estimate selectivity for the histogram part of the
655 : * population by counting matches in the histogram. If not, estimate
656 : * selectivity of the fixed prefix and remainder of pattern
657 : * separately, then combine the two to get an estimate of the
658 : * selectivity for the part of the column population represented by
659 : * the histogram. (For small histograms, we combine these
660 : * approaches.)
661 : *
662 : * We then add up data for any most-common-values values; these are
663 : * not in the histogram population, and we can get exact answers for
664 : * them by applying the pattern operator, so there's no reason to
665 : * approximate. (If the MCVs cover a significant part of the total
666 : * population, this gives us a big leg up in accuracy.)
667 : */
668 : Selectivity selec;
669 : int hist_size;
670 : FmgrInfo opproc;
671 : double mcv_selec,
672 : sumcommon;
673 :
674 : /* Try to use the histogram entries to get selectivity */
1515 675 1937 : if (!OidIsValid(opfuncid))
676 1925 : opfuncid = get_opcode(oprid);
677 1937 : fmgr_info(opfuncid, &opproc);
678 :
1038 679 1937 : selec = histogram_selectivity(&vardata, &opproc, collation,
680 : constval, true,
681 : 10, 1, &hist_size);
682 :
683 : /* If not at least 100 entries, use the heuristic method */
1515 684 1937 : if (hist_size < 100)
685 : {
686 : Selectivity heursel;
687 : Selectivity prefixsel;
688 :
689 1298 : if (pstatus == Pattern_Prefix_Partial)
1236 690 923 : prefixsel = prefix_selectivity(root, &vardata,
691 : eqopr, ltopr, geopr,
692 : collation,
693 : prefix);
694 : else
1515 695 375 : prefixsel = 1.0;
696 1298 : heursel = prefixsel * rest_selec;
697 :
698 1298 : if (selec < 0) /* fewer than 10 histogram entries? */
699 1154 : selec = heursel;
700 : else
701 : {
702 : /*
703 : * For histogram sizes from 10 to 100, we combine the
704 : * histogram and heuristic selectivities, putting increasingly
705 : * more trust in the histogram for larger sizes.
706 : */
707 144 : double hist_weight = hist_size / 100.0;
708 :
709 144 : selec = selec * hist_weight + heursel * (1.0 - hist_weight);
710 : }
711 : }
712 :
713 : /* In any case, don't believe extremely small or large estimates. */
714 1937 : if (selec < 0.0001)
715 663 : selec = 0.0001;
716 1274 : else if (selec > 0.9999)
717 68 : selec = 0.9999;
718 :
719 : /*
720 : * If we have most-common-values info, add up the fractions of the MCV
721 : * entries that satisfy MCV OP PATTERN. These fractions contribute
722 : * directly to the result selectivity. Also add up the total fraction
723 : * represented by MCV entries.
724 : */
1038 725 1937 : mcv_selec = mcv_selectivity(&vardata, &opproc, collation,
726 : constval, true,
727 : &sumcommon);
728 :
729 : /*
730 : * Now merge the results from the MCV and histogram calculations,
731 : * realizing that the histogram covers only the non-null values that
732 : * are not listed in MCV.
733 : */
1515 734 1937 : selec *= 1.0 - nullfrac - sumcommon;
735 1937 : selec += mcv_selec;
736 1937 : result = selec;
737 : }
738 :
739 : /* now adjust if we wanted not-match rather than match */
740 4733 : if (negate)
741 413 : result = 1.0 - result - nullfrac;
742 :
743 : /* result should be in range, but make sure... */
744 4733 : CLAMP_PROBABILITY(result);
745 :
746 4733 : if (prefix)
747 : {
748 4433 : pfree(DatumGetPointer(prefix->constvalue));
749 4433 : pfree(prefix);
750 : }
751 :
752 4733 : ReleaseVariableStats(vardata);
753 :
754 4733 : return result;
755 : }
756 :
757 : /*
758 : * Fix impedance mismatch between SQL-callable functions and patternsel_common
759 : */
760 : static double
761 4835 : patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
762 : {
763 4835 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
764 4835 : Oid operator = PG_GETARG_OID(1);
765 4835 : List *args = (List *) PG_GETARG_POINTER(2);
766 4835 : int varRelid = PG_GETARG_INT32(3);
767 4835 : Oid collation = PG_GET_COLLATION();
768 :
769 : /*
770 : * If this is for a NOT LIKE or similar operator, get the corresponding
771 : * positive-match operator and work with that.
772 : */
773 4835 : if (negate)
774 : {
775 469 : operator = get_negator(operator);
776 469 : if (!OidIsValid(operator))
1515 tgl 777 UBC 0 : elog(ERROR, "patternsel called for operator without a negator");
778 : }
779 :
1515 tgl 780 CBC 4835 : return patternsel_common(root,
781 : operator,
782 : InvalidOid,
783 : args,
784 : varRelid,
785 : collation,
786 : ptype,
787 : negate);
788 : }
789 :
790 : /*
791 : * regexeqsel - Selectivity of regular-expression pattern match.
792 : */
793 : Datum
794 3212 : regexeqsel(PG_FUNCTION_ARGS)
795 : {
796 3212 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
797 : }
798 :
799 : /*
800 : * icregexeqsel - Selectivity of case-insensitive regex match.
801 : */
802 : Datum
803 44 : icregexeqsel(PG_FUNCTION_ARGS)
804 : {
805 44 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
806 : }
807 :
808 : /*
809 : * likesel - Selectivity of LIKE pattern match.
810 : */
811 : Datum
812 1007 : likesel(PG_FUNCTION_ARGS)
813 : {
814 1007 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
815 : }
816 :
817 : /*
818 : * prefixsel - selectivity of prefix operator
819 : */
820 : Datum
821 27 : prefixsel(PG_FUNCTION_ARGS)
822 : {
823 27 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
824 : }
825 :
826 : /*
827 : *
828 : * iclikesel - Selectivity of ILIKE pattern match.
829 : */
830 : Datum
831 76 : iclikesel(PG_FUNCTION_ARGS)
832 : {
833 76 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
834 : }
835 :
836 : /*
837 : * regexnesel - Selectivity of regular-expression pattern non-match.
838 : */
839 : Datum
840 392 : regexnesel(PG_FUNCTION_ARGS)
841 : {
842 392 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
843 : }
844 :
845 : /*
846 : * icregexnesel - Selectivity of case-insensitive regex non-match.
847 : */
848 : Datum
849 8 : icregexnesel(PG_FUNCTION_ARGS)
850 : {
851 8 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
852 : }
853 :
854 : /*
855 : * nlikesel - Selectivity of LIKE pattern non-match.
856 : */
857 : Datum
858 65 : nlikesel(PG_FUNCTION_ARGS)
859 : {
860 65 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
861 : }
862 :
863 : /*
864 : * icnlikesel - Selectivity of ILIKE pattern non-match.
865 : */
866 : Datum
867 4 : icnlikesel(PG_FUNCTION_ARGS)
868 : {
869 4 : PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
870 : }
871 :
872 : /*
873 : * patternjoinsel - Generic code for pattern-match join selectivity.
874 : */
875 : static double
876 91 : patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
877 : {
878 : /* For the moment we just punt. */
879 91 : return negate ? (1.0 - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
880 : }
881 :
882 : /*
883 : * regexeqjoinsel - Join selectivity of regular-expression pattern match.
884 : */
885 : Datum
886 91 : regexeqjoinsel(PG_FUNCTION_ARGS)
887 : {
888 91 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
889 : }
890 :
891 : /*
892 : * icregexeqjoinsel - Join selectivity of case-insensitive regex match.
893 : */
894 : Datum
1515 tgl 895 UBC 0 : icregexeqjoinsel(PG_FUNCTION_ARGS)
896 : {
897 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
898 : }
899 :
900 : /*
901 : * likejoinsel - Join selectivity of LIKE pattern match.
902 : */
903 : Datum
904 0 : likejoinsel(PG_FUNCTION_ARGS)
905 : {
906 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
907 : }
908 :
909 : /*
910 : * prefixjoinsel - Join selectivity of prefix operator
911 : */
912 : Datum
913 0 : prefixjoinsel(PG_FUNCTION_ARGS)
914 : {
915 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
916 : }
917 :
918 : /*
919 : * iclikejoinsel - Join selectivity of ILIKE pattern match.
920 : */
921 : Datum
922 0 : iclikejoinsel(PG_FUNCTION_ARGS)
923 : {
924 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
925 : }
926 :
927 : /*
928 : * regexnejoinsel - Join selectivity of regex non-match.
929 : */
930 : Datum
931 0 : regexnejoinsel(PG_FUNCTION_ARGS)
932 : {
933 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
934 : }
935 :
936 : /*
937 : * icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
938 : */
939 : Datum
940 0 : icregexnejoinsel(PG_FUNCTION_ARGS)
941 : {
942 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
943 : }
944 :
945 : /*
946 : * nlikejoinsel - Join selectivity of LIKE pattern non-match.
947 : */
948 : Datum
949 0 : nlikejoinsel(PG_FUNCTION_ARGS)
950 : {
951 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
952 : }
953 :
954 : /*
955 : * icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
956 : */
957 : Datum
958 0 : icnlikejoinsel(PG_FUNCTION_ARGS)
959 : {
960 0 : PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
961 : }
962 :
963 :
964 : /*-------------------------------------------------------------------------
965 : *
966 : * Pattern analysis functions
967 : *
968 : * These routines support analysis of LIKE and regular-expression patterns
969 : * by the planner/optimizer. It's important that they agree with the
970 : * regular-expression code in backend/regex/ and the LIKE code in
971 : * backend/utils/adt/like.c. Also, the computation of the fixed prefix
972 : * must be conservative: if we report a string longer than the true fixed
973 : * prefix, the query may produce actually wrong answers, rather than just
974 : * getting a bad selectivity estimate!
975 : *
976 : *-------------------------------------------------------------------------
977 : */
978 :
979 : /*
980 : * Extract the fixed prefix, if any, for a pattern.
981 : *
982 : * *prefix is set to a palloc'd prefix string (in the form of a Const node),
983 : * or to NULL if no fixed prefix exists for the pattern.
984 : * If rest_selec is not NULL, *rest_selec is set to an estimate of the
985 : * selectivity of the remainder of the pattern (without any fixed prefix).
986 : * The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
987 : *
988 : * The return value distinguishes no fixed prefix, a partial prefix,
989 : * or an exact-match-only pattern.
990 : */
991 :
992 : static Pattern_Prefix_Status
1515 tgl 993 CBC 1711 : like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
994 : Const **prefix_const, Selectivity *rest_selec)
995 : {
996 : char *match;
997 : char *patt;
998 : int pattlen;
999 1711 : Oid typeid = patt_const->consttype;
1000 : int pos,
1001 : match_pos;
1002 1711 : bool is_multibyte = (pg_database_encoding_max_length() > 1);
1003 1711 : pg_locale_t locale = 0;
1004 1711 : bool locale_is_c = false;
1005 :
1006 : /* the right-hand const is type text or bytea */
1007 1711 : Assert(typeid == BYTEAOID || typeid == TEXTOID);
1008 :
1009 1711 : if (case_insensitive)
1010 : {
1011 134 : if (typeid == BYTEAOID)
1515 tgl 1012 UBC 0 : ereport(ERROR,
1013 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1014 : errmsg("case insensitive matching not supported on type bytea")));
1015 :
444 peter 1016 CBC 134 : if (!OidIsValid(collation))
1017 : {
1018 : /*
1019 : * This typically means that the parser could not resolve a
1020 : * conflict of implicit collations, so report it that way.
1021 : */
444 peter 1022 UBC 0 : ereport(ERROR,
1023 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
1024 : errmsg("could not determine which collation to use for ILIKE"),
1025 : errhint("Use the COLLATE clause to set the collation explicitly.")));
1026 : }
1027 :
1028 : /* If case-insensitive, we need locale info */
1515 tgl 1029 CBC 134 : if (lc_ctype_is_c(collation))
1030 58 : locale_is_c = true;
1031 : else
1032 76 : locale = pg_newlocale_from_collation(collation);
1033 : }
1034 :
1035 1711 : if (typeid != BYTEAOID)
1036 : {
1037 1705 : patt = TextDatumGetCString(patt_const->constvalue);
1038 1705 : pattlen = strlen(patt);
1039 : }
1040 : else
1041 : {
1042 6 : bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
1043 :
1044 6 : pattlen = VARSIZE_ANY_EXHDR(bstr);
1045 6 : patt = (char *) palloc(pattlen);
1046 6 : memcpy(patt, VARDATA_ANY(bstr), pattlen);
1047 6 : Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
1048 : }
1049 :
1050 1711 : match = palloc(pattlen + 1);
1051 1711 : match_pos = 0;
1052 10611 : for (pos = 0; pos < pattlen; pos++)
1053 : {
1054 : /* % and _ are wildcard characters in LIKE */
1055 10564 : if (patt[pos] == '%' ||
1056 9489 : patt[pos] == '_')
1057 : break;
1058 :
1059 : /* Backslash escapes the next character */
1060 8999 : if (patt[pos] == '\\')
1061 : {
1062 131 : pos++;
1063 131 : if (pos >= pattlen)
1515 tgl 1064 UBC 0 : break;
1065 : }
1066 :
1067 : /* Stop if case-varying character (it's sort of a wildcard) */
1515 tgl 1068 CBC 9146 : if (case_insensitive &&
1069 147 : pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
1070 99 : break;
1071 :
1072 8900 : match[match_pos++] = patt[pos];
1073 : }
1074 :
1075 1711 : match[match_pos] = '\0';
1076 :
1077 1711 : if (typeid != BYTEAOID)
1078 1705 : *prefix_const = string_to_const(match, typeid);
1079 : else
1080 6 : *prefix_const = string_to_bytea_const(match, match_pos);
1081 :
1082 1711 : if (rest_selec != NULL)
1083 1144 : *rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1084 : case_insensitive);
1085 :
1086 1711 : pfree(patt);
1087 1711 : pfree(match);
1088 :
1089 : /* in LIKE, an empty pattern is an exact match! */
1090 1711 : if (pos == pattlen)
1091 47 : return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1092 :
1093 1664 : if (match_pos > 0)
1094 1387 : return Pattern_Prefix_Partial;
1095 :
1096 277 : return Pattern_Prefix_None;
1097 : }
1098 :
1099 : static Pattern_Prefix_Status
1100 6461 : regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1101 : Const **prefix_const, Selectivity *rest_selec)
1102 : {
1103 6461 : Oid typeid = patt_const->consttype;
1104 : char *prefix;
1105 : bool exact;
1106 :
1107 : /*
1108 : * Should be unnecessary, there are no bytea regex operators defined. As
1109 : * such, it should be noted that the rest of this function has *not* been
1110 : * made safe for binary (possibly NULL containing) strings.
1111 : */
1112 6461 : if (typeid == BYTEAOID)
1515 tgl 1113 UBC 0 : ereport(ERROR,
1114 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1115 : errmsg("regular-expression matching not supported on type bytea")));
1116 :
1117 : /* Use the regexp machinery to extract the prefix, if any */
1515 tgl 1118 CBC 6461 : prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1119 : case_insensitive, collation,
1120 : &exact);
1121 :
1122 6449 : if (prefix == NULL)
1123 : {
1124 364 : *prefix_const = NULL;
1125 :
1126 364 : if (rest_selec != NULL)
1127 : {
1128 300 : char *patt = TextDatumGetCString(patt_const->constvalue);
1129 :
1130 300 : *rest_selec = regex_selectivity(patt, strlen(patt),
1131 : case_insensitive,
1132 : 0);
1133 300 : pfree(patt);
1134 : }
1135 :
1136 364 : return Pattern_Prefix_None;
1137 : }
1138 :
1139 6085 : *prefix_const = string_to_const(prefix, typeid);
1140 :
1141 6085 : if (rest_selec != NULL)
1142 : {
1143 3250 : if (exact)
1144 : {
1145 : /* Exact match, so there's no additional selectivity */
1146 2767 : *rest_selec = 1.0;
1147 : }
1148 : else
1149 : {
1150 483 : char *patt = TextDatumGetCString(patt_const->constvalue);
1151 :
1152 966 : *rest_selec = regex_selectivity(patt, strlen(patt),
1153 : case_insensitive,
1154 483 : strlen(prefix));
1155 483 : pfree(patt);
1156 : }
1157 : }
1158 :
1159 6085 : pfree(prefix);
1160 :
1161 6085 : if (exact)
1162 5459 : return Pattern_Prefix_Exact; /* pattern specifies exact match */
1163 : else
1164 626 : return Pattern_Prefix_Partial;
1165 : }
1166 :
1167 : static Pattern_Prefix_Status
1168 8223 : pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1169 : Const **prefix, Selectivity *rest_selec)
1170 : {
1171 : Pattern_Prefix_Status result;
1172 :
1173 8223 : switch (ptype)
1174 : {
1175 1577 : case Pattern_Type_Like:
1176 1577 : result = like_fixed_prefix(patt, false, collation,
1177 : prefix, rest_selec);
1178 1577 : break;
1179 134 : case Pattern_Type_Like_IC:
1180 134 : result = like_fixed_prefix(patt, true, collation,
1181 : prefix, rest_selec);
1182 134 : break;
1183 6415 : case Pattern_Type_Regex:
1184 6415 : result = regex_fixed_prefix(patt, false, collation,
1185 : prefix, rest_selec);
1186 6403 : break;
1187 46 : case Pattern_Type_Regex_IC:
1188 46 : result = regex_fixed_prefix(patt, true, collation,
1189 : prefix, rest_selec);
1190 46 : break;
1191 51 : case Pattern_Type_Prefix:
1192 : /* Prefix type work is trivial. */
1193 51 : result = Pattern_Prefix_Partial;
1194 51 : *prefix = makeConst(patt->consttype,
1195 : patt->consttypmod,
1196 : patt->constcollid,
1197 : patt->constlen,
1198 : datumCopy(patt->constvalue,
1199 51 : patt->constbyval,
1200 : patt->constlen),
1201 51 : patt->constisnull,
1202 51 : patt->constbyval);
508 1203 51 : if (rest_selec != NULL)
1204 39 : *rest_selec = 1.0; /* all */
1515 1205 51 : break;
1515 tgl 1206 UBC 0 : default:
1207 0 : elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1208 : result = Pattern_Prefix_None; /* keep compiler quiet */
1209 : break;
1210 : }
1515 tgl 1211 CBC 8211 : return result;
1212 : }
1213 :
1214 : /*
1215 : * Estimate the selectivity of a fixed prefix for a pattern match.
1216 : *
1217 : * A fixed prefix "foo" is estimated as the selectivity of the expression
1218 : * "variable >= 'foo' AND variable < 'fop'".
1219 : *
1220 : * The selectivity estimate is with respect to the portion of the column
1221 : * population represented by the histogram --- the caller must fold this
1222 : * together with info about MCVs and NULLs.
1223 : *
1224 : * We use the given comparison operators and collation to do the estimation.
1225 : * The given variable and Const must be of the associated datatype(s).
1226 : *
1227 : * XXX Note: we make use of the upper bound to estimate operator selectivity
1228 : * even if the locale is such that we cannot rely on the upper-bound string.
1229 : * The selectivity only needs to be approximately right anyway, so it seems
1230 : * more useful to use the upper-bound code than not.
1231 : */
1232 : static Selectivity
1233 923 : prefix_selectivity(PlannerInfo *root, VariableStatData *vardata,
1234 : Oid eqopr, Oid ltopr, Oid geopr,
1235 : Oid collation,
1236 : Const *prefixcon)
1237 : {
1238 : Selectivity prefixsel;
1239 : FmgrInfo opproc;
1240 : Const *greaterstrcon;
1241 : Selectivity eq_sel;
1242 :
1243 : /* Estimate the selectivity of "x >= prefix" */
1236 1244 923 : fmgr_info(get_opcode(geopr), &opproc);
1245 :
1515 1246 923 : prefixsel = ineq_histogram_selectivity(root, vardata,
1247 : geopr, &opproc, true, true,
1248 : collation,
1249 : prefixcon->constvalue,
1250 : prefixcon->consttype);
1251 :
1252 923 : if (prefixsel < 0.0)
1253 : {
1254 : /* No histogram is present ... return a suitable default estimate */
1255 534 : return DEFAULT_MATCH_SEL;
1256 : }
1257 :
1258 : /*
1259 : * If we can create a string larger than the prefix, say "x < greaterstr".
1260 : */
1236 1261 389 : fmgr_info(get_opcode(ltopr), &opproc);
1038 1262 389 : greaterstrcon = make_greater_string(prefixcon, &opproc, collation);
1515 1263 389 : if (greaterstrcon)
1264 : {
1265 : Selectivity topsel;
1266 :
1267 389 : topsel = ineq_histogram_selectivity(root, vardata,
1268 : ltopr, &opproc, false, false,
1269 : collation,
1270 : greaterstrcon->constvalue,
1271 : greaterstrcon->consttype);
1272 :
1273 : /* ineq_histogram_selectivity worked before, it shouldn't fail now */
1274 389 : Assert(topsel >= 0.0);
1275 :
1276 : /*
1277 : * Merge the two selectivities in the same way as for a range query
1278 : * (see clauselist_selectivity()). Note that we don't need to worry
1279 : * about double-exclusion of nulls, since ineq_histogram_selectivity
1280 : * doesn't count those anyway.
1281 : */
1282 389 : prefixsel = topsel + prefixsel - 1.0;
1283 : }
1284 :
1285 : /*
1286 : * If the prefix is long then the two bounding values might be too close
1287 : * together for the histogram to distinguish them usefully, resulting in a
1288 : * zero estimate (plus or minus roundoff error). To avoid returning a
1289 : * ridiculously small estimate, compute the estimated selectivity for
1290 : * "variable = 'foo'", and clamp to that. (Obviously, the resultant
1291 : * estimate should be at least that.)
1292 : *
1293 : * We apply this even if we couldn't make a greater string. That case
1294 : * suggests that the prefix is near the maximum possible, and thus
1295 : * probably off the end of the histogram, and thus we probably got a very
1296 : * small estimate from the >= condition; so we still need to clamp.
1297 : */
1038 1298 389 : eq_sel = var_eq_const(vardata, eqopr, collation, prefixcon->constvalue,
1299 : false, true, false);
1300 :
1515 1301 389 : prefixsel = Max(prefixsel, eq_sel);
1302 :
1303 389 : return prefixsel;
1304 : }
1305 :
1306 :
1307 : /*
1308 : * Estimate the selectivity of a pattern of the specified type.
1309 : * Note that any fixed prefix of the pattern will have been removed already,
1310 : * so actually we may be looking at just a fragment of the pattern.
1311 : *
1312 : * For now, we use a very simplistic approach: fixed characters reduce the
1313 : * selectivity a good deal, character ranges reduce it a little,
1314 : * wildcards (such as % for LIKE or .* for regex) increase it.
1315 : */
1316 :
1317 : #define FIXED_CHAR_SEL 0.20 /* about 1/5 */
1318 : #define CHAR_RANGE_SEL 0.25
1319 : #define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1320 : #define FULL_WILDCARD_SEL 5.0
1321 : #define PARTIAL_WILDCARD_SEL 2.0
1322 :
1323 : static Selectivity
1324 1144 : like_selectivity(const char *patt, int pattlen, bool case_insensitive)
1325 : {
1326 1144 : Selectivity sel = 1.0;
1327 : int pos;
1328 :
1329 : /* Skip any leading wildcard; it's already factored into initial sel */
1330 2219 : for (pos = 0; pos < pattlen; pos++)
1331 : {
1332 1482 : if (patt[pos] != '%' && patt[pos] != '_')
1333 407 : break;
1334 : }
1335 :
1336 3363 : for (; pos < pattlen; pos++)
1337 : {
1338 : /* % and _ are wildcard characters in LIKE */
1339 2219 : if (patt[pos] == '%')
1340 355 : sel *= FULL_WILDCARD_SEL;
1341 1864 : else if (patt[pos] == '_')
1342 76 : sel *= ANY_CHAR_SEL;
1343 1788 : else if (patt[pos] == '\\')
1344 : {
1345 : /* Backslash quotes the next character */
1346 20 : pos++;
1347 20 : if (pos >= pattlen)
1515 tgl 1348 UBC 0 : break;
1515 tgl 1349 CBC 20 : sel *= FIXED_CHAR_SEL;
1350 : }
1351 : else
1352 1768 : sel *= FIXED_CHAR_SEL;
1353 : }
1354 : /* Could get sel > 1 if multiple wildcards */
1355 1144 : if (sel > 1.0)
1515 tgl 1356 UBC 0 : sel = 1.0;
1515 tgl 1357 CBC 1144 : return sel;
1358 : }
1359 :
1360 : static Selectivity
1361 926 : regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
1362 : {
1363 926 : Selectivity sel = 1.0;
1364 926 : int paren_depth = 0;
1365 926 : int paren_pos = 0; /* dummy init to keep compiler quiet */
1366 : int pos;
1367 :
1368 : /* since this function recurses, it could be driven to stack overflow */
228 1369 926 : check_stack_depth();
1370 :
1515 1371 8978 : for (pos = 0; pos < pattlen; pos++)
1372 : {
1373 8061 : if (patt[pos] == '(')
1374 : {
1375 137 : if (paren_depth == 0)
1376 134 : paren_pos = pos; /* remember start of parenthesized item */
1377 137 : paren_depth++;
1378 : }
1379 7924 : else if (patt[pos] == ')' && paren_depth > 0)
1380 : {
1381 137 : paren_depth--;
1382 137 : if (paren_depth == 0)
1383 134 : sel *= regex_selectivity_sub(patt + (paren_pos + 1),
1384 134 : pos - (paren_pos + 1),
1385 : case_insensitive);
1386 : }
1387 7787 : else if (patt[pos] == '|' && paren_depth == 0)
1388 : {
1389 : /*
1390 : * If unquoted | is present at paren level 0 in pattern, we have
1391 : * multiple alternatives; sum their probabilities.
1392 : */
1393 18 : sel += regex_selectivity_sub(patt + (pos + 1),
1394 9 : pattlen - (pos + 1),
1395 : case_insensitive);
1396 9 : break; /* rest of pattern is now processed */
1397 : }
1398 7778 : else if (patt[pos] == '[')
1399 : {
1400 38 : bool negclass = false;
1401 :
1402 38 : if (patt[++pos] == '^')
1403 : {
1515 tgl 1404 UBC 0 : negclass = true;
1405 0 : pos++;
1406 : }
1515 tgl 1407 CBC 38 : if (patt[pos] == ']') /* ']' at start of class is not special */
1515 tgl 1408 UBC 0 : pos++;
1515 tgl 1409 CBC 186 : while (pos < pattlen && patt[pos] != ']')
1410 148 : pos++;
1411 38 : if (paren_depth == 0)
1412 38 : sel *= (negclass ? (1.0 - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1413 : }
1414 7740 : else if (patt[pos] == '.')
1415 : {
1416 323 : if (paren_depth == 0)
1417 193 : sel *= ANY_CHAR_SEL;
1418 : }
1419 7417 : else if (patt[pos] == '*' ||
1420 7134 : patt[pos] == '?' ||
1421 7108 : patt[pos] == '+')
1422 : {
1423 : /* Ought to be smarter about quantifiers... */
1424 316 : if (paren_depth == 0)
1425 181 : sel *= PARTIAL_WILDCARD_SEL;
1426 : }
1427 7101 : else if (patt[pos] == '{')
1428 : {
1429 132 : while (pos < pattlen && patt[pos] != '}')
1430 94 : pos++;
1431 38 : if (paren_depth == 0)
1432 32 : sel *= PARTIAL_WILDCARD_SEL;
1433 : }
1434 7063 : else if (patt[pos] == '\\')
1435 : {
1436 : /* backslash quotes the next character */
1437 100 : pos++;
1438 100 : if (pos >= pattlen)
1515 tgl 1439 UBC 0 : break;
1515 tgl 1440 CBC 100 : if (paren_depth == 0)
1441 52 : sel *= FIXED_CHAR_SEL;
1442 : }
1443 : else
1444 : {
1445 6963 : if (paren_depth == 0)
1446 6056 : sel *= FIXED_CHAR_SEL;
1447 : }
1448 : }
1449 : /* Could get sel > 1 if multiple wildcards */
1450 926 : if (sel > 1.0)
1451 13 : sel = 1.0;
1452 926 : return sel;
1453 : }
1454 :
1455 : static Selectivity
1456 783 : regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
1457 : int fixed_prefix_len)
1458 : {
1459 : Selectivity sel;
1460 :
1461 : /* If patt doesn't end with $, consider it to have a trailing wildcard */
1462 783 : if (pattlen > 0 && patt[pattlen - 1] == '$' &&
1463 137 : (pattlen == 1 || patt[pattlen - 2] != '\\'))
1464 : {
1465 : /* has trailing $ */
1466 137 : sel = regex_selectivity_sub(patt, pattlen - 1, case_insensitive);
1467 : }
1468 : else
1469 : {
1470 : /* no trailing $ */
1471 646 : sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1472 646 : sel *= FULL_WILDCARD_SEL;
1473 : }
1474 :
1475 : /*
1476 : * If there's a fixed prefix, discount its selectivity. We have to be
1477 : * careful here since a very long prefix could result in pow's result
1478 : * underflowing to zero (in which case "sel" probably has as well).
1479 : */
1480 783 : if (fixed_prefix_len > 0)
1481 : {
786 1482 483 : double prefixsel = pow(FIXED_CHAR_SEL, fixed_prefix_len);
1483 :
1484 483 : if (prefixsel > 0.0)
1485 483 : sel /= prefixsel;
1486 : }
1487 :
1488 : /* Make sure result stays in range */
1515 1489 783 : CLAMP_PROBABILITY(sel);
1490 783 : return sel;
1491 : }
1492 :
1493 : /*
1494 : * Check whether char is a letter (and, hence, subject to case-folding)
1495 : *
1496 : * In multibyte character sets or with ICU, we can't use isalpha, and it does
1497 : * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1498 : * Instead, just assume any non-ASCII char is potentially case-varying, and
1499 : * hard-wire knowledge of which ASCII chars are letters.
1500 : */
1501 : static int
1502 147 : pattern_char_isalpha(char c, bool is_multibyte,
1503 : pg_locale_t locale, bool locale_is_c)
1504 : {
1505 147 : if (locale_is_c)
1506 78 : return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1507 69 : else if (is_multibyte && IS_HIGHBIT_SET(c))
1515 tgl 1508 UBC 0 : return true;
1515 tgl 1509 CBC 69 : else if (locale && locale->provider == COLLPROVIDER_ICU)
1336 1510 63 : return IS_HIGHBIT_SET(c) ||
1511 126 : (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1512 : #ifdef HAVE_LOCALE_T
1515 1513 6 : else if (locale && locale->provider == COLLPROVIDER_LIBC)
1514 6 : return isalpha_l((unsigned char) c, locale->info.lt);
1515 : #endif
1516 : else
1515 tgl 1517 UBC 0 : return isalpha((unsigned char) c);
1518 : }
1519 :
1520 :
1521 : /*
1522 : * For bytea, the increment function need only increment the current byte
1523 : * (there are no multibyte characters to worry about).
1524 : */
1525 : static bool
1526 0 : byte_increment(unsigned char *ptr, int len)
1527 : {
1528 0 : if (*ptr >= 255)
1529 0 : return false;
1530 0 : (*ptr)++;
1531 0 : return true;
1532 : }
1533 :
1534 : /*
1535 : * Try to generate a string greater than the given string or any
1536 : * string it is a prefix of. If successful, return a palloc'd string
1537 : * in the form of a Const node; else return NULL.
1538 : *
1539 : * The caller must provide the appropriate "less than" comparison function
1540 : * for testing the strings, along with the collation to use.
1541 : *
1542 : * The key requirement here is that given a prefix string, say "foo",
1543 : * we must be able to generate another string "fop" that is greater than
1544 : * all strings "foobar" starting with "foo". We can test that we have
1545 : * generated a string greater than the prefix string, but in non-C collations
1546 : * that is not a bulletproof guarantee that an extension of the string might
1547 : * not sort after it; an example is that "foo " is less than "foo!", but it
1548 : * is not clear that a "dictionary" sort ordering will consider "foo!" less
1549 : * than "foo bar". CAUTION: Therefore, this function should be used only for
1550 : * estimation purposes when working in a non-C collation.
1551 : *
1552 : * To try to catch most cases where an extended string might otherwise sort
1553 : * before the result value, we determine which of the strings "Z", "z", "y",
1554 : * and "9" is seen as largest by the collation, and append that to the given
1555 : * prefix before trying to find a string that compares as larger.
1556 : *
1557 : * To search for a greater string, we repeatedly "increment" the rightmost
1558 : * character, using an encoding-specific character incrementer function.
1559 : * When it's no longer possible to increment the last character, we truncate
1560 : * off that character and start incrementing the next-to-rightmost.
1561 : * For example, if "z" were the last character in the sort order, then we
1562 : * could produce "foo" as a string greater than "fonz".
1563 : *
1564 : * This could be rather slow in the worst case, but in most cases we
1565 : * won't have to try more than one or two strings before succeeding.
1566 : *
1567 : * Note that it's important for the character incrementer not to be too anal
1568 : * about producing every possible character code, since in some cases the only
1569 : * way to get a larger string is to increment a previous character position.
1570 : * So we don't want to spend too much time trying every possible character
1571 : * code at the last position. A good rule of thumb is to be sure that we
1572 : * don't try more than 256*K values for a K-byte character (and definitely
1573 : * not 256^K, which is what an exhaustive search would approach).
1574 : */
1575 : static Const *
1515 tgl 1576 CBC 980 : make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
1577 : {
1578 980 : Oid datatype = str_const->consttype;
1579 : char *workstr;
1580 : int len;
1581 : Datum cmpstr;
1582 980 : char *cmptxt = NULL;
1583 : mbcharacter_incrementer charinc;
1584 :
1585 : /*
1586 : * Get a modifiable copy of the prefix string in C-string format, and set
1587 : * up the string we will compare to as a Datum. In C locale this can just
1588 : * be the given prefix string, otherwise we need to add a suffix. Type
1589 : * BYTEA sorts bytewise so it never needs a suffix either.
1590 : */
1591 980 : if (datatype == BYTEAOID)
1592 : {
1515 tgl 1593 UBC 0 : bytea *bstr = DatumGetByteaPP(str_const->constvalue);
1594 :
1595 0 : len = VARSIZE_ANY_EXHDR(bstr);
1596 0 : workstr = (char *) palloc(len);
1597 0 : memcpy(workstr, VARDATA_ANY(bstr), len);
1598 0 : Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
1599 0 : cmpstr = str_const->constvalue;
1600 : }
1601 : else
1602 : {
1515 tgl 1603 CBC 980 : if (datatype == NAMEOID)
1515 tgl 1604 UBC 0 : workstr = DatumGetCString(DirectFunctionCall1(nameout,
1605 : str_const->constvalue));
1606 : else
1515 tgl 1607 CBC 980 : workstr = TextDatumGetCString(str_const->constvalue);
1608 980 : len = strlen(workstr);
1609 980 : if (lc_collate_is_c(collation) || len == 0)
1610 967 : cmpstr = str_const->constvalue;
1611 : else
1612 : {
1613 : /* If first time through, determine the suffix to use */
1614 : static char suffixchar = 0;
1615 : static Oid suffixcollation = 0;
1616 :
1617 13 : if (!suffixchar || suffixcollation != collation)
1618 : {
1619 : char *best;
1620 :
1621 4 : best = "Z";
1622 4 : if (varstr_cmp(best, 1, "z", 1, collation) < 0)
1623 3 : best = "z";
1624 4 : if (varstr_cmp(best, 1, "y", 1, collation) < 0)
1515 tgl 1625 UBC 0 : best = "y";
1515 tgl 1626 CBC 4 : if (varstr_cmp(best, 1, "9", 1, collation) < 0)
1515 tgl 1627 UBC 0 : best = "9";
1515 tgl 1628 CBC 4 : suffixchar = *best;
1629 4 : suffixcollation = collation;
1630 : }
1631 :
1632 : /* And build the string to compare to */
1633 13 : if (datatype == NAMEOID)
1634 : {
1515 tgl 1635 UBC 0 : cmptxt = palloc(len + 2);
1636 0 : memcpy(cmptxt, workstr, len);
1637 0 : cmptxt[len] = suffixchar;
1638 0 : cmptxt[len + 1] = '\0';
1639 0 : cmpstr = PointerGetDatum(cmptxt);
1640 : }
1641 : else
1642 : {
1515 tgl 1643 CBC 13 : cmptxt = palloc(VARHDRSZ + len + 1);
1644 13 : SET_VARSIZE(cmptxt, VARHDRSZ + len + 1);
1645 13 : memcpy(VARDATA(cmptxt), workstr, len);
1646 13 : *(VARDATA(cmptxt) + len) = suffixchar;
1647 13 : cmpstr = PointerGetDatum(cmptxt);
1648 : }
1649 : }
1650 : }
1651 :
1652 : /* Select appropriate character-incrementer function */
1653 980 : if (datatype == BYTEAOID)
1515 tgl 1654 UBC 0 : charinc = byte_increment;
1655 : else
1515 tgl 1656 CBC 980 : charinc = pg_database_encoding_character_incrementer();
1657 :
1658 : /* And search ... */
1659 980 : while (len > 0)
1660 : {
1661 : int charlen;
1662 : unsigned char *lastchar;
1663 :
1664 : /* Identify the last character --- for bytea, just the last byte */
1665 980 : if (datatype == BYTEAOID)
1515 tgl 1666 UBC 0 : charlen = 1;
1667 : else
1515 tgl 1668 CBC 980 : charlen = len - pg_mbcliplen(workstr, len, len - 1);
1669 980 : lastchar = (unsigned char *) (workstr + len - charlen);
1670 :
1671 : /*
1672 : * Try to generate a larger string by incrementing the last character
1673 : * (for BYTEA, we treat each byte as a character).
1674 : *
1675 : * Note: the incrementer function is expected to return true if it's
1676 : * generated a valid-per-the-encoding new character, otherwise false.
1677 : * The contents of the character on false return are unspecified.
1678 : */
1679 980 : while (charinc(lastchar, charlen))
1680 : {
1681 : Const *workstr_const;
1682 :
1683 980 : if (datatype == BYTEAOID)
1515 tgl 1684 UBC 0 : workstr_const = string_to_bytea_const(workstr, len);
1685 : else
1515 tgl 1686 CBC 980 : workstr_const = string_to_const(workstr, datatype);
1687 :
1688 980 : if (DatumGetBool(FunctionCall2Coll(ltproc,
1689 : collation,
1690 : cmpstr,
1691 : workstr_const->constvalue)))
1692 : {
1693 : /* Successfully made a string larger than cmpstr */
1694 980 : if (cmptxt)
1695 13 : pfree(cmptxt);
1696 980 : pfree(workstr);
1697 980 : return workstr_const;
1698 : }
1699 :
1700 : /* No good, release unusable value and try again */
1515 tgl 1701 UBC 0 : pfree(DatumGetPointer(workstr_const->constvalue));
1702 0 : pfree(workstr_const);
1703 : }
1704 :
1705 : /*
1706 : * No luck here, so truncate off the last character and try to
1707 : * increment the next one.
1708 : */
1709 0 : len -= charlen;
1710 0 : workstr[len] = '\0';
1711 : }
1712 :
1713 : /* Failed... */
1714 0 : if (cmptxt)
1715 0 : pfree(cmptxt);
1716 0 : pfree(workstr);
1717 :
1718 0 : return NULL;
1719 : }
1720 :
1721 : /*
1722 : * Generate a Datum of the appropriate type from a C string.
1723 : * Note that all of the supported types are pass-by-ref, so the
1724 : * returned value should be pfree'd if no longer needed.
1725 : */
1726 : static Datum
1515 tgl 1727 CBC 8770 : string_to_datum(const char *str, Oid datatype)
1728 : {
1729 8770 : Assert(str != NULL);
1730 :
1731 : /*
1732 : * We cheat a little by assuming that CStringGetTextDatum() will do for
1733 : * bpchar and varchar constants too...
1734 : */
1735 8770 : if (datatype == NAMEOID)
1515 tgl 1736 UBC 0 : return DirectFunctionCall1(namein, CStringGetDatum(str));
1515 tgl 1737 CBC 8770 : else if (datatype == BYTEAOID)
1515 tgl 1738 UBC 0 : return DirectFunctionCall1(byteain, CStringGetDatum(str));
1739 : else
1515 tgl 1740 CBC 8770 : return CStringGetTextDatum(str);
1741 : }
1742 :
1743 : /*
1744 : * Generate a Const node of the appropriate type from a C string.
1745 : */
1746 : static Const *
1747 8770 : string_to_const(const char *str, Oid datatype)
1748 : {
1749 8770 : Datum conval = string_to_datum(str, datatype);
1750 : Oid collation;
1751 : int constlen;
1752 :
1753 : /*
1754 : * We only need to support a few datatypes here, so hard-wire properties
1755 : * instead of incurring the expense of catalog lookups.
1756 : */
1757 8770 : switch (datatype)
1758 : {
1759 8770 : case TEXTOID:
1760 : case VARCHAROID:
1761 : case BPCHAROID:
1762 8770 : collation = DEFAULT_COLLATION_OID;
1763 8770 : constlen = -1;
1764 8770 : break;
1765 :
1515 tgl 1766 UBC 0 : case NAMEOID:
1767 0 : collation = C_COLLATION_OID;
1768 0 : constlen = NAMEDATALEN;
1769 0 : break;
1770 :
1771 0 : case BYTEAOID:
1772 0 : collation = InvalidOid;
1773 0 : constlen = -1;
1774 0 : break;
1775 :
1776 0 : default:
1777 0 : elog(ERROR, "unexpected datatype in string_to_const: %u",
1778 : datatype);
1779 : return NULL;
1780 : }
1781 :
1515 tgl 1782 CBC 8770 : return makeConst(datatype, -1, collation, constlen,
1783 : conval, false, false);
1784 : }
1785 :
1786 : /*
1787 : * Generate a Const node of bytea type from a binary C string and a length.
1788 : */
1789 : static Const *
1790 6 : string_to_bytea_const(const char *str, size_t str_len)
1791 : {
1792 6 : bytea *bstr = palloc(VARHDRSZ + str_len);
1793 : Datum conval;
1794 :
1795 6 : memcpy(VARDATA(bstr), str, str_len);
1796 6 : SET_VARSIZE(bstr, VARHDRSZ + str_len);
1797 6 : conval = PointerGetDatum(bstr);
1798 :
1799 6 : return makeConst(BYTEAOID, -1, InvalidOid, -1, conval, false, false);
1800 : }
|