TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * multirangetypes_selfuncs.c
4 : * Functions for selectivity estimation of multirange operators
5 : *
6 : * Estimates are based on histograms of lower and upper bounds, and the
7 : * fraction of empty multiranges.
8 : *
9 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : *
13 : * IDENTIFICATION
14 : * src/backend/utils/adt/multirangetypes_selfuncs.c
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 :
20 : #include <math.h>
21 :
22 : #include "access/htup_details.h"
23 : #include "catalog/pg_operator.h"
24 : #include "catalog/pg_statistic.h"
25 : #include "catalog/pg_type.h"
26 : #include "utils/float.h"
27 : #include "utils/fmgrprotos.h"
28 : #include "utils/lsyscache.h"
29 : #include "utils/rangetypes.h"
30 : #include "utils/multirangetypes.h"
31 : #include "utils/selfuncs.h"
32 : #include "utils/typcache.h"
33 :
34 : static double calc_multirangesel(TypeCacheEntry *typcache,
35 : VariableStatData *vardata,
36 : const MultirangeType *constval, Oid operator);
37 : static double default_multirange_selectivity(Oid operator);
38 : static double default_multirange_selectivity(Oid operator);
39 : static double calc_hist_selectivity(TypeCacheEntry *typcache,
40 : VariableStatData *vardata,
41 : const MultirangeType *constval,
42 : Oid operator);
43 : static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
44 : const RangeBound *constbound,
45 : const RangeBound *hist,
46 : int hist_nvalues, bool equal);
47 : static int rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value,
48 : const RangeBound *hist, int hist_length, bool equal);
49 : static float8 get_position(TypeCacheEntry *typcache, const RangeBound *value,
50 : const RangeBound *hist1, const RangeBound *hist2);
51 : static float8 get_len_position(double value, double hist1, double hist2);
52 : static float8 get_distance(TypeCacheEntry *typcache, const RangeBound *bound1,
53 : const RangeBound *bound2);
54 : static int length_hist_bsearch(Datum *length_hist_values,
55 : int length_hist_nvalues, double value,
56 : bool equal);
57 : static double calc_length_hist_frac(Datum *length_hist_values,
58 : int length_hist_nvalues, double length1,
59 : double length2, bool equal);
60 : static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
61 : const RangeBound *lower,
62 : RangeBound *upper,
63 : const RangeBound *hist_lower,
64 : int hist_nvalues,
65 : Datum *length_hist_values,
66 : int length_hist_nvalues);
67 : static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
68 : const RangeBound *lower,
69 : const RangeBound *upper,
70 : const RangeBound *hist_lower,
71 : int hist_nvalues,
72 : Datum *length_hist_values,
73 : int length_hist_nvalues);
74 :
75 : /*
76 : * Returns a default selectivity estimate for given operator, when we don't
77 : * have statistics or cannot use them for some reason.
78 : */
79 : static double
80 CBC 96 : default_multirange_selectivity(Oid operator)
81 : {
82 96 : switch (operator)
83 : {
84 9 : case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
85 : case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
86 : case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
87 9 : return 0.01;
88 :
89 27 : case OID_RANGE_CONTAINS_MULTIRANGE_OP:
90 : case OID_RANGE_MULTIRANGE_CONTAINED_OP:
91 : case OID_MULTIRANGE_CONTAINS_RANGE_OP:
92 : case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
93 : case OID_MULTIRANGE_RANGE_CONTAINED_OP:
94 : case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
95 27 : return 0.005;
96 :
97 UBC 0 : case OID_MULTIRANGE_CONTAINS_ELEM_OP:
98 : case OID_MULTIRANGE_ELEM_CONTAINED_OP:
99 :
100 : /*
101 : * "multirange @> elem" is more or less identical to a scalar
102 : * inequality "A >= b AND A <= c".
103 : */
104 0 : return DEFAULT_MULTIRANGE_INEQ_SEL;
105 :
106 CBC 60 : case OID_MULTIRANGE_LESS_OP:
107 : case OID_MULTIRANGE_LESS_EQUAL_OP:
108 : case OID_MULTIRANGE_GREATER_OP:
109 : case OID_MULTIRANGE_GREATER_EQUAL_OP:
110 : case OID_MULTIRANGE_LEFT_RANGE_OP:
111 : case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
112 : case OID_RANGE_LEFT_MULTIRANGE_OP:
113 : case OID_MULTIRANGE_RIGHT_RANGE_OP:
114 : case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
115 : case OID_RANGE_RIGHT_MULTIRANGE_OP:
116 : case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
117 : case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
118 : case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
119 : case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
120 : case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
121 : case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
122 : /* these are similar to regular scalar inequalities */
123 60 : return DEFAULT_INEQ_SEL;
124 :
125 UBC 0 : default:
126 :
127 : /*
128 : * all multirange operators should be handled above, but just in
129 : * case
130 : */
131 0 : return 0.01;
132 : }
133 : }
134 :
135 : /*
136 : * multirangesel -- restriction selectivity for multirange operators
137 : */
138 : Datum
139 CBC 333 : multirangesel(PG_FUNCTION_ARGS)
140 : {
141 333 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
142 333 : Oid operator = PG_GETARG_OID(1);
143 333 : List *args = (List *) PG_GETARG_POINTER(2);
144 333 : int varRelid = PG_GETARG_INT32(3);
145 : VariableStatData vardata;
146 : Node *other;
147 : bool varonleft;
148 : Selectivity selec;
149 333 : TypeCacheEntry *typcache = NULL;
150 333 : MultirangeType *constmultirange = NULL;
151 333 : RangeType *constrange = NULL;
152 :
153 : /*
154 : * If expression is not (variable op something) or (something op
155 : * variable), then punt and return a default estimate.
156 : */
157 333 : if (!get_restriction_variable(root, args, varRelid,
158 : &vardata, &other, &varonleft))
159 UBC 0 : PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
160 :
161 : /*
162 : * Can't do anything useful if the something is not a constant, either.
163 : */
164 CBC 333 : if (!IsA(other, Const))
165 : {
166 UBC 0 : ReleaseVariableStats(vardata);
167 0 : PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
168 : }
169 :
170 : /*
171 : * All the multirange operators are strict, so we can cope with a NULL
172 : * constant right away.
173 : */
174 CBC 333 : if (((Const *) other)->constisnull)
175 : {
176 UBC 0 : ReleaseVariableStats(vardata);
177 0 : PG_RETURN_FLOAT8(0.0);
178 : }
179 :
180 : /*
181 : * If var is on the right, commute the operator, so that we can assume the
182 : * var is on the left in what follows.
183 : */
184 CBC 333 : if (!varonleft)
185 : {
186 : /* we have other Op var, commute to make var Op other */
187 12 : operator = get_commutator(operator);
188 12 : if (!operator)
189 : {
190 : /* Use default selectivity (should we raise an error instead?) */
191 UBC 0 : ReleaseVariableStats(vardata);
192 0 : PG_RETURN_FLOAT8(default_multirange_selectivity(operator));
193 : }
194 : }
195 :
196 : /*
197 : * OK, there's a Var and a Const we're dealing with here. We need the
198 : * Const to be of same multirange type as the column, else we can't do
199 : * anything useful. (Such cases will likely fail at runtime, but here we'd
200 : * rather just return a default estimate.)
201 : *
202 : * If the operator is "multirange @> element", the constant should be of
203 : * the element type of the multirange column. Convert it to a multirange
204 : * that includes only that single point, so that we don't need special
205 : * handling for that in what follows.
206 : */
207 CBC 333 : if (operator == OID_MULTIRANGE_CONTAINS_ELEM_OP)
208 : {
209 15 : typcache = multirange_get_typcache(fcinfo, vardata.vartype);
210 :
211 15 : if (((Const *) other)->consttype == typcache->rngtype->rngelemtype->type_id)
212 : {
213 : RangeBound lower,
214 : upper;
215 :
216 15 : lower.inclusive = true;
217 15 : lower.val = ((Const *) other)->constvalue;
218 15 : lower.infinite = false;
219 15 : lower.lower = true;
220 15 : upper.inclusive = true;
221 15 : upper.val = ((Const *) other)->constvalue;
222 15 : upper.infinite = false;
223 15 : upper.lower = false;
224 GNC 15 : constrange = range_serialize(typcache->rngtype, &lower, &upper,
225 : false, NULL);
226 GIC 15 : constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
227 ECB : 1, &constrange);
228 : }
229 : }
230 GIC 318 : else if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
231 CBC 285 : operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
232 267 : operator == OID_MULTIRANGE_OVERLAPS_RANGE_OP ||
233 255 : operator == OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP ||
234 243 : operator == OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP ||
235 231 : operator == OID_MULTIRANGE_LEFT_RANGE_OP ||
236 ECB : operator == OID_MULTIRANGE_RIGHT_RANGE_OP)
237 : {
238 : /*
239 : * Promote a range in "multirange OP range" just like we do an element
240 : * in "multirange OP element".
241 : */
242 GIC 99 : typcache = multirange_get_typcache(fcinfo, vardata.vartype);
243 CBC 99 : if (((Const *) other)->consttype == typcache->rngtype->type_id)
244 ECB : {
245 GIC 99 : constrange = DatumGetRangeTypeP(((Const *) other)->constvalue);
246 CBC 99 : constmultirange = make_multirange(typcache->type_id, typcache->rngtype,
247 ECB : 1, &constrange);
248 : }
249 : }
250 GIC 219 : else if (operator == OID_RANGE_OVERLAPS_MULTIRANGE_OP ||
251 CBC 201 : operator == OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP ||
252 192 : operator == OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP ||
253 183 : operator == OID_RANGE_LEFT_MULTIRANGE_OP ||
254 174 : operator == OID_RANGE_RIGHT_MULTIRANGE_OP ||
255 156 : operator == OID_RANGE_CONTAINS_MULTIRANGE_OP ||
256 156 : operator == OID_MULTIRANGE_ELEM_CONTAINED_OP ||
257 ECB : operator == OID_MULTIRANGE_RANGE_CONTAINED_OP)
258 : {
259 : /*
260 : * Here, the Var is the elem/range, not the multirange. For now we
261 : * just punt and return the default estimate. In future we could
262 : * disassemble the multirange constant to do something more
263 : * intelligent.
264 : */
265 : }
266 GIC 147 : else if (((Const *) other)->consttype == vardata.vartype)
267 ECB : {
268 : /* Both sides are the same multirange type */
269 GIC 147 : typcache = multirange_get_typcache(fcinfo, vardata.vartype);
270 ECB :
271 GIC 147 : constmultirange = DatumGetMultirangeTypeP(((Const *) other)->constvalue);
272 ECB : }
273 :
274 : /*
275 : * If we got a valid constant on one side of the operator, proceed to
276 : * estimate using statistics. Otherwise punt and return a default constant
277 : * estimate. Note that calc_multirangesel need not handle
278 : * OID_MULTIRANGE_*_CONTAINED_OP.
279 : */
280 GIC 333 : if (constmultirange)
281 CBC 261 : selec = calc_multirangesel(typcache, &vardata, constmultirange, operator);
282 ECB : else
283 GIC 72 : selec = default_multirange_selectivity(operator);
284 ECB :
285 GIC 333 : ReleaseVariableStats(vardata);
286 ECB :
287 GIC 333 : CLAMP_PROBABILITY(selec);
288 ECB :
289 GIC 333 : PG_RETURN_FLOAT8((float8) selec);
290 ECB : }
291 :
292 : static double
293 GIC 261 : calc_multirangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
294 ECB : const MultirangeType *constval, Oid operator)
295 : {
296 : double hist_selec;
297 : double selec;
298 : float4 empty_frac,
299 : null_frac;
300 :
301 : /*
302 : * First look up the fraction of NULLs and empty multiranges from
303 : * pg_statistic.
304 : */
305 GIC 261 : if (HeapTupleIsValid(vardata->statsTuple))
306 ECB : {
307 : Form_pg_statistic stats;
308 : AttStatsSlot sslot;
309 :
310 GIC 225 : stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
311 CBC 225 : null_frac = stats->stanullfrac;
312 ECB :
313 : /* Try to get fraction of empty multiranges */
314 GIC 225 : if (get_attstatsslot(&sslot, vardata->statsTuple,
315 ECB : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
316 : InvalidOid,
317 : ATTSTATSSLOT_NUMBERS))
318 : {
319 GIC 225 : if (sslot.nnumbers != 1)
320 LBC 0 : elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
321 GBC 225 : empty_frac = sslot.numbers[0];
322 CBC 225 : free_attstatsslot(&sslot);
323 ECB : }
324 : else
325 : {
326 : /* No empty fraction statistic. Assume no empty ranges. */
327 UIC 0 : empty_frac = 0.0;
328 EUB : }
329 : }
330 : else
331 : {
332 : /*
333 : * No stats are available. Follow through the calculations below
334 : * anyway, assuming no NULLs and no empty multiranges. This still
335 : * allows us to give a better-than-nothing estimate based on whether
336 : * the constant is an empty multirange or not.
337 : */
338 GIC 36 : null_frac = 0.0;
339 CBC 36 : empty_frac = 0.0;
340 ECB : }
341 :
342 GIC 261 : if (MultirangeIsEmpty(constval))
343 ECB : {
344 : /*
345 : * An empty multirange matches all multiranges, all empty multiranges,
346 : * or nothing, depending on the operator
347 : */
348 GIC 111 : switch (operator)
349 ECB : {
350 : /* these return false if either argument is empty */
351 GIC 63 : case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
352 ECB : case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
353 : case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
354 : case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
355 : case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
356 : case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
357 : case OID_MULTIRANGE_LEFT_RANGE_OP:
358 : case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
359 : case OID_MULTIRANGE_RIGHT_RANGE_OP:
360 : case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
361 : /* nothing is less than an empty multirange */
362 : case OID_MULTIRANGE_LESS_OP:
363 GIC 63 : selec = 0.0;
364 CBC 63 : break;
365 ECB :
366 : /*
367 : * only empty multiranges can be contained by an empty
368 : * multirange
369 : */
370 GIC 15 : case OID_RANGE_MULTIRANGE_CONTAINED_OP:
371 ECB : case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
372 : /* only empty ranges are <= an empty multirange */
373 : case OID_MULTIRANGE_LESS_EQUAL_OP:
374 GIC 15 : selec = empty_frac;
375 CBC 15 : break;
376 ECB :
377 : /* everything contains an empty multirange */
378 GIC 30 : case OID_MULTIRANGE_CONTAINS_RANGE_OP:
379 ECB : case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
380 : /* everything is >= an empty multirange */
381 : case OID_MULTIRANGE_GREATER_EQUAL_OP:
382 GIC 30 : selec = 1.0;
383 CBC 30 : break;
384 ECB :
385 : /* all non-empty multiranges are > an empty multirange */
386 GIC 3 : case OID_MULTIRANGE_GREATER_OP:
387 CBC 3 : selec = 1.0 - empty_frac;
388 3 : break;
389 ECB :
390 : /* an element cannot be empty */
391 UIC 0 : case OID_MULTIRANGE_CONTAINS_ELEM_OP:
392 EUB :
393 : /* filtered out by multirangesel() */
394 : case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
395 : case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
396 : case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
397 : case OID_RANGE_LEFT_MULTIRANGE_OP:
398 : case OID_RANGE_RIGHT_MULTIRANGE_OP:
399 : case OID_RANGE_CONTAINS_MULTIRANGE_OP:
400 : case OID_MULTIRANGE_ELEM_CONTAINED_OP:
401 : case OID_MULTIRANGE_RANGE_CONTAINED_OP:
402 :
403 : default:
404 UIC 0 : elog(ERROR, "unexpected operator %u", operator);
405 EUB : selec = 0.0; /* keep compiler quiet */
406 : break;
407 : }
408 : }
409 : else
410 : {
411 : /*
412 : * Calculate selectivity using bound histograms. If that fails for
413 : * some reason, e.g no histogram in pg_statistic, use the default
414 : * constant estimate for the fraction of non-empty values. This is
415 : * still somewhat better than just returning the default estimate,
416 : * because this still takes into account the fraction of empty and
417 : * NULL tuples, if we had statistics for them.
418 : */
419 GIC 150 : hist_selec = calc_hist_selectivity(typcache, vardata, constval,
420 ECB : operator);
421 GIC 150 : if (hist_selec < 0.0)
422 CBC 24 : hist_selec = default_multirange_selectivity(operator);
423 ECB :
424 : /*
425 : * Now merge the results for the empty multiranges and histogram
426 : * calculations, realizing that the histogram covers only the
427 : * non-null, non-empty values.
428 : */
429 GIC 150 : if (operator == OID_RANGE_MULTIRANGE_CONTAINED_OP ||
430 ECB : operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
431 : {
432 : /* empty is contained by anything non-empty */
433 GIC 12 : selec = (1.0 - empty_frac) * hist_selec + empty_frac;
434 ECB : }
435 : else
436 : {
437 : /* with any other operator, empty Op non-empty matches nothing */
438 GIC 138 : selec = (1.0 - empty_frac) * hist_selec;
439 ECB : }
440 : }
441 :
442 : /* all multirange operators are strict */
443 GIC 261 : selec *= (1.0 - null_frac);
444 ECB :
445 : /* result should be in range, but make sure... */
446 GIC 261 : CLAMP_PROBABILITY(selec);
447 ECB :
448 GIC 261 : return selec;
449 ECB : }
450 :
451 : /*
452 : * Calculate multirange operator selectivity using histograms of multirange bounds.
453 : *
454 : * This estimate is for the portion of values that are not empty and not
455 : * NULL.
456 : */
457 : static double
458 GIC 150 : calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
459 ECB : const MultirangeType *constval, Oid operator)
460 : {
461 GIC 150 : TypeCacheEntry *rng_typcache = typcache->rngtype;
462 ECB : AttStatsSlot hslot;
463 : AttStatsSlot lslot;
464 : int nhist;
465 : RangeBound *hist_lower;
466 : RangeBound *hist_upper;
467 : int i;
468 : RangeBound const_lower;
469 : RangeBound const_upper;
470 : RangeBound tmp;
471 : double hist_selec;
472 :
473 : /* Can't use the histogram with insecure multirange support functions */
474 GIC 150 : if (!statistic_proc_security_check(vardata,
475 ECB : rng_typcache->rng_cmp_proc_finfo.fn_oid))
476 UIC 0 : return -1;
477 GBC 150 : if (OidIsValid(rng_typcache->rng_subdiff_finfo.fn_oid) &&
478 CBC 150 : !statistic_proc_security_check(vardata,
479 ECB : rng_typcache->rng_subdiff_finfo.fn_oid))
480 UIC 0 : return -1;
481 EUB :
482 : /* Try to get histogram of ranges */
483 GIC 276 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
484 CBC 126 : get_attstatsslot(&hslot, vardata->statsTuple,
485 ECB : STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
486 : ATTSTATSSLOT_VALUES)))
487 GIC 24 : return -1.0;
488 ECB :
489 : /* check that it's a histogram, not just a dummy entry */
490 GIC 126 : if (hslot.nvalues < 2)
491 ECB : {
492 UIC 0 : free_attstatsslot(&hslot);
493 UBC 0 : return -1.0;
494 EUB : }
495 :
496 : /*
497 : * Convert histogram of ranges into histograms of its lower and upper
498 : * bounds.
499 : */
500 GIC 126 : nhist = hslot.nvalues;
501 CBC 126 : hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
502 126 : hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
503 9468 : for (i = 0; i < nhist; i++)
504 ECB : {
505 : bool empty;
506 :
507 GIC 9342 : range_deserialize(rng_typcache, DatumGetRangeTypeP(hslot.values[i]),
508 CBC 9342 : &hist_lower[i], &hist_upper[i], &empty);
509 ECB : /* The histogram should not contain any empty ranges */
510 GIC 9342 : if (empty)
511 LBC 0 : elog(ERROR, "bounds histogram contains an empty range");
512 EUB : }
513 :
514 : /* @> and @< also need a histogram of range lengths */
515 GIC 126 : if (operator == OID_MULTIRANGE_CONTAINS_RANGE_OP ||
516 CBC 102 : operator == OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP ||
517 102 : operator == OID_MULTIRANGE_RANGE_CONTAINED_OP ||
518 ECB : operator == OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP)
519 : {
520 GIC 60 : if (!(HeapTupleIsValid(vardata->statsTuple) &&
521 CBC 30 : get_attstatsslot(&lslot, vardata->statsTuple,
522 ECB : STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
523 : InvalidOid,
524 : ATTSTATSSLOT_VALUES)))
525 : {
526 UIC 0 : free_attstatsslot(&hslot);
527 UBC 0 : return -1.0;
528 EUB : }
529 :
530 : /* check that it's a histogram, not just a dummy entry */
531 GIC 30 : if (lslot.nvalues < 2)
532 ECB : {
533 UIC 0 : free_attstatsslot(&lslot);
534 UBC 0 : free_attstatsslot(&hslot);
535 0 : return -1.0;
536 EUB : }
537 : }
538 : else
539 GIC 96 : memset(&lslot, 0, sizeof(lslot));
540 ECB :
541 : /* Extract the bounds of the constant value. */
542 GIC 126 : Assert(constval->rangeCount > 0);
543 CBC 126 : multirange_get_bounds(rng_typcache, constval, 0,
544 ECB : &const_lower, &tmp);
545 GIC 126 : multirange_get_bounds(rng_typcache, constval, constval->rangeCount - 1,
546 ECB : &tmp, &const_upper);
547 :
548 : /*
549 : * Calculate selectivity comparing the lower or upper bound of the
550 : * constant with the histogram of lower or upper bounds.
551 : */
552 GIC 126 : switch (operator)
553 ECB : {
554 UIC 0 : case OID_MULTIRANGE_LESS_OP:
555 EUB :
556 : /*
557 : * The regular b-tree comparison operators (<, <=, >, >=) compare
558 : * the lower bounds first, and the upper bounds for values with
559 : * equal lower bounds. Estimate that by comparing the lower bounds
560 : * only. This gives a fairly accurate estimate assuming there
561 : * aren't many rows with a lower bound equal to the constant's
562 : * lower bound.
563 : */
564 : hist_selec =
565 UIC 0 : calc_hist_selectivity_scalar(rng_typcache, &const_lower,
566 EUB : hist_lower, nhist, false);
567 UIC 0 : break;
568 EUB :
569 UIC 0 : case OID_MULTIRANGE_LESS_EQUAL_OP:
570 EUB : hist_selec =
571 UIC 0 : calc_hist_selectivity_scalar(rng_typcache, &const_lower,
572 EUB : hist_lower, nhist, true);
573 UIC 0 : break;
574 EUB :
575 UIC 0 : case OID_MULTIRANGE_GREATER_OP:
576 UBC 0 : hist_selec =
577 0 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
578 EUB : hist_lower, nhist, false);
579 UIC 0 : break;
580 EUB :
581 UIC 0 : case OID_MULTIRANGE_GREATER_EQUAL_OP:
582 UBC 0 : hist_selec =
583 0 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
584 EUB : hist_lower, nhist, true);
585 UIC 0 : break;
586 EUB :
587 GIC 12 : case OID_MULTIRANGE_LEFT_RANGE_OP:
588 ECB : case OID_MULTIRANGE_LEFT_MULTIRANGE_OP:
589 : /* var << const when upper(var) < lower(const) */
590 : hist_selec =
591 GIC 12 : calc_hist_selectivity_scalar(rng_typcache, &const_lower,
592 ECB : hist_upper, nhist, false);
593 GIC 12 : break;
594 ECB :
595 GIC 12 : case OID_MULTIRANGE_RIGHT_RANGE_OP:
596 ECB : case OID_MULTIRANGE_RIGHT_MULTIRANGE_OP:
597 : /* var >> const when lower(var) > upper(const) */
598 GIC 12 : hist_selec =
599 CBC 12 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_upper,
600 ECB : hist_lower, nhist, true);
601 GIC 12 : break;
602 ECB :
603 GIC 12 : case OID_MULTIRANGE_OVERLAPS_RIGHT_RANGE_OP:
604 ECB : case OID_MULTIRANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
605 : /* compare lower bounds */
606 GIC 12 : hist_selec =
607 CBC 12 : 1 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
608 ECB : hist_lower, nhist, false);
609 GIC 12 : break;
610 ECB :
611 GIC 12 : case OID_MULTIRANGE_OVERLAPS_LEFT_RANGE_OP:
612 ECB : case OID_MULTIRANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
613 : /* compare upper bounds */
614 : hist_selec =
615 GIC 12 : calc_hist_selectivity_scalar(rng_typcache, &const_upper,
616 ECB : hist_upper, nhist, true);
617 GIC 12 : break;
618 ECB :
619 GIC 42 : case OID_MULTIRANGE_OVERLAPS_RANGE_OP:
620 ECB : case OID_MULTIRANGE_OVERLAPS_MULTIRANGE_OP:
621 : case OID_MULTIRANGE_CONTAINS_ELEM_OP:
622 :
623 : /*
624 : * A && B <=> NOT (A << B OR A >> B).
625 : *
626 : * Since A << B and A >> B are mutually exclusive events we can
627 : * sum their probabilities to find probability of (A << B OR A >>
628 : * B).
629 : *
630 : * "multirange @> elem" is equivalent to "multirange &&
631 : * {[elem,elem]}". The caller already constructed the singular
632 : * range from the element constant, so just treat it the same as
633 : * &&.
634 : */
635 : hist_selec =
636 GIC 42 : calc_hist_selectivity_scalar(rng_typcache,
637 ECB : &const_lower, hist_upper,
638 : nhist, false);
639 GIC 42 : hist_selec +=
640 CBC 42 : (1.0 - calc_hist_selectivity_scalar(rng_typcache,
641 ECB : &const_upper, hist_lower,
642 : nhist, true));
643 GIC 42 : hist_selec = 1.0 - hist_selec;
644 CBC 42 : break;
645 ECB :
646 GIC 24 : case OID_MULTIRANGE_CONTAINS_RANGE_OP:
647 ECB : case OID_MULTIRANGE_CONTAINS_MULTIRANGE_OP:
648 : hist_selec =
649 GIC 24 : calc_hist_selectivity_contains(rng_typcache, &const_lower,
650 ECB : &const_upper, hist_lower, nhist,
651 : lslot.values, lslot.nvalues);
652 GIC 24 : break;
653 ECB :
654 GIC 12 : case OID_MULTIRANGE_MULTIRANGE_CONTAINED_OP:
655 ECB : case OID_RANGE_MULTIRANGE_CONTAINED_OP:
656 GIC 12 : if (const_lower.infinite)
657 ECB : {
658 : /*
659 : * Lower bound no longer matters. Just estimate the fraction
660 : * with an upper bound <= const upper bound
661 : */
662 : hist_selec =
663 UIC 0 : calc_hist_selectivity_scalar(rng_typcache, &const_upper,
664 EUB : hist_upper, nhist, true);
665 : }
666 GIC 12 : else if (const_upper.infinite)
667 ECB : {
668 UIC 0 : hist_selec =
669 UBC 0 : 1.0 - calc_hist_selectivity_scalar(rng_typcache, &const_lower,
670 EUB : hist_lower, nhist, false);
671 : }
672 : else
673 : {
674 : hist_selec =
675 GIC 12 : calc_hist_selectivity_contained(rng_typcache, &const_lower,
676 ECB : &const_upper, hist_lower, nhist,
677 : lslot.values, lslot.nvalues);
678 : }
679 GIC 12 : break;
680 ECB :
681 : /* filtered out by multirangesel() */
682 UIC 0 : case OID_RANGE_OVERLAPS_MULTIRANGE_OP:
683 EUB : case OID_RANGE_OVERLAPS_LEFT_MULTIRANGE_OP:
684 : case OID_RANGE_OVERLAPS_RIGHT_MULTIRANGE_OP:
685 : case OID_RANGE_LEFT_MULTIRANGE_OP:
686 : case OID_RANGE_RIGHT_MULTIRANGE_OP:
687 : case OID_RANGE_CONTAINS_MULTIRANGE_OP:
688 : case OID_MULTIRANGE_ELEM_CONTAINED_OP:
689 : case OID_MULTIRANGE_RANGE_CONTAINED_OP:
690 :
691 : default:
692 UIC 0 : elog(ERROR, "unknown multirange operator %u", operator);
693 EUB : hist_selec = -1.0; /* keep compiler quiet */
694 : break;
695 : }
696 :
697 GIC 126 : free_attstatsslot(&lslot);
698 CBC 126 : free_attstatsslot(&hslot);
699 ECB :
700 GIC 126 : return hist_selec;
701 ECB : }
702 :
703 :
704 : /*
705 : * Look up the fraction of values less than (or equal, if 'equal' argument
706 : * is true) a given const in a histogram of range bounds.
707 : */
708 : static double
709 GIC 132 : calc_hist_selectivity_scalar(TypeCacheEntry *typcache, const RangeBound *constbound,
710 ECB : const RangeBound *hist, int hist_nvalues, bool equal)
711 : {
712 : Selectivity selec;
713 : int index;
714 :
715 : /*
716 : * Find the histogram bin the given constant falls into. Estimate
717 : * selectivity as the number of preceding whole bins.
718 : */
719 GIC 132 : index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
720 CBC 132 : selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
721 ECB :
722 : /* Adjust using linear interpolation within the bin */
723 GIC 132 : if (index >= 0 && index < hist_nvalues - 1)
724 CBC 180 : selec += get_position(typcache, constbound, &hist[index],
725 90 : &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
726 ECB :
727 GIC 132 : return selec;
728 ECB : }
729 :
730 : /*
731 : * Binary search on an array of range bounds. Returns greatest index of range
732 : * bound in array which is less(less or equal) than given range bound. If all
733 : * range bounds in array are greater or equal(greater) than given range bound,
734 : * return -1. When "equal" flag is set conditions in brackets are used.
735 : *
736 : * This function is used in scalar operator selectivity estimation. Another
737 : * goal of this function is to find a histogram bin where to stop
738 : * interpolation of portion of bounds which are less than or equal to given bound.
739 : */
740 : static int
741 GIC 168 : rbound_bsearch(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist,
742 ECB : int hist_length, bool equal)
743 : {
744 GIC 168 : int lower = -1,
745 CBC 168 : upper = hist_length - 1,
746 ECB : cmp,
747 : middle;
748 :
749 GIC 1068 : while (lower < upper)
750 ECB : {
751 GIC 900 : middle = (lower + upper + 1) / 2;
752 CBC 900 : cmp = range_cmp_bounds(typcache, &hist[middle], value);
753 ECB :
754 GIC 900 : if (cmp < 0 || (equal && cmp == 0))
755 CBC 372 : lower = middle;
756 ECB : else
757 GIC 528 : upper = middle - 1;
758 ECB : }
759 GIC 168 : return lower;
760 ECB : }
761 :
762 :
763 : /*
764 : * Binary search on length histogram. Returns greatest index of range length in
765 : * histogram which is less than (less than or equal) the given length value. If
766 : * all lengths in the histogram are greater than (greater than or equal) the
767 : * given length, returns -1.
768 : */
769 : static int
770 GIC 180 : length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
771 ECB : double value, bool equal)
772 : {
773 GIC 180 : int lower = -1,
774 CBC 180 : upper = length_hist_nvalues - 1,
775 ECB : middle;
776 :
777 GIC 936 : while (lower < upper)
778 ECB : {
779 : double middleval;
780 :
781 GIC 756 : middle = (lower + upper + 1) / 2;
782 ECB :
783 GIC 756 : middleval = DatumGetFloat8(length_hist_values[middle]);
784 CBC 756 : if (middleval < value || (equal && middleval <= value))
785 372 : lower = middle;
786 ECB : else
787 GIC 384 : upper = middle - 1;
788 ECB : }
789 GIC 180 : return lower;
790 ECB : }
791 :
792 : /*
793 : * Get relative position of value in histogram bin in [0,1] range.
794 : */
795 : static float8
796 GIC 138 : get_position(TypeCacheEntry *typcache, const RangeBound *value, const RangeBound *hist1,
797 ECB : const RangeBound *hist2)
798 : {
799 GIC 138 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
800 ECB : float8 position;
801 :
802 GIC 138 : if (!hist1->infinite && !hist2->infinite)
803 ECB : {
804 : float8 bin_width;
805 :
806 : /*
807 : * Both bounds are finite. Assuming the subtype's comparison function
808 : * works sanely, the value must be finite, too, because it lies
809 : * somewhere between the bounds. If it doesn't, arbitrarily return
810 : * 0.5.
811 : */
812 GIC 102 : if (value->infinite)
813 LBC 0 : return 0.5;
814 EUB :
815 : /* Can't interpolate without subdiff function */
816 GIC 102 : if (!has_subdiff)
817 LBC 0 : return 0.5;
818 EUB :
819 : /* Calculate relative position using subdiff function. */
820 GIC 102 : bin_width = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
821 ECB : typcache->rng_collation,
822 GIC 102 : hist2->val,
823 CBC 102 : hist1->val));
824 102 : if (isnan(bin_width) || bin_width <= 0.0)
825 LBC 0 : return 0.5; /* punt for NaN or zero-width bin */
826 EUB :
827 GIC 102 : position = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
828 ECB : typcache->rng_collation,
829 GIC 102 : value->val,
830 CBC 102 : hist1->val))
831 ECB : / bin_width;
832 :
833 GIC 102 : if (isnan(position))
834 LBC 0 : return 0.5; /* punt for NaN from subdiff, Inf/Inf, etc */
835 EUB :
836 : /* Relative position must be in [0,1] range */
837 GIC 102 : position = Max(position, 0.0);
838 CBC 102 : position = Min(position, 1.0);
839 102 : return position;
840 ECB : }
841 GIC 36 : else if (hist1->infinite && !hist2->infinite)
842 ECB : {
843 : /*
844 : * Lower bin boundary is -infinite, upper is finite. If the value is
845 : * -infinite, return 0.0 to indicate it's equal to the lower bound.
846 : * Otherwise return 1.0 to indicate it's infinitely far from the lower
847 : * bound.
848 : */
849 GIC 30 : return ((value->infinite && value->lower) ? 0.0 : 1.0);
850 ECB : }
851 GIC 6 : else if (!hist1->infinite && hist2->infinite)
852 ECB : {
853 : /* same as above, but in reverse */
854 GIC 6 : return ((value->infinite && !value->lower) ? 1.0 : 0.0);
855 ECB : }
856 : else
857 : {
858 : /*
859 : * If both bin boundaries are infinite, they should be equal to each
860 : * other, and the value should also be infinite and equal to both
861 : * bounds. (But don't Assert that, to avoid crashing if a user creates
862 : * a datatype with a broken comparison function).
863 : *
864 : * Assume the value to lie in the middle of the infinite bounds.
865 : */
866 UIC 0 : return 0.5;
867 EUB : }
868 : }
869 :
870 :
871 : /*
872 : * Get relative position of value in a length histogram bin in [0,1] range.
873 : */
874 : static double
875 GIC 180 : get_len_position(double value, double hist1, double hist2)
876 ECB : {
877 GIC 180 : if (!isinf(hist1) && !isinf(hist2))
878 ECB : {
879 : /*
880 : * Both bounds are finite. The value should be finite too, because it
881 : * lies somewhere between the bounds. If it doesn't, just return
882 : * something.
883 : */
884 GIC 30 : if (isinf(value))
885 LBC 0 : return 0.5;
886 EUB :
887 GIC 30 : return 1.0 - (hist2 - value) / (hist2 - hist1);
888 ECB : }
889 GIC 150 : else if (isinf(hist1) && !isinf(hist2))
890 ECB : {
891 : /*
892 : * Lower bin boundary is -infinite, upper is finite. Return 1.0 to
893 : * indicate the value is infinitely far from the lower bound.
894 : */
895 UIC 0 : return 1.0;
896 EUB : }
897 GIC 150 : else if (isinf(hist1) && isinf(hist2))
898 ECB : {
899 : /* same as above, but in reverse */
900 UIC 0 : return 0.0;
901 EUB : }
902 : else
903 : {
904 : /*
905 : * If both bin boundaries are infinite, they should be equal to each
906 : * other, and the value should also be infinite and equal to both
907 : * bounds. (But don't Assert that, to avoid crashing unnecessarily if
908 : * the caller messes up)
909 : *
910 : * Assume the value to lie in the middle of the infinite bounds.
911 : */
912 GIC 150 : return 0.5;
913 ECB : }
914 : }
915 :
916 : /*
917 : * Measure distance between two range bounds.
918 : */
919 : static float8
920 GIC 204 : get_distance(TypeCacheEntry *typcache, const RangeBound *bound1, const RangeBound *bound2)
921 ECB : {
922 GIC 204 : bool has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
923 ECB :
924 GIC 204 : if (!bound1->infinite && !bound2->infinite)
925 ECB : {
926 : /*
927 : * Neither bound is infinite, use subdiff function or return default
928 : * value of 1.0 if no subdiff is available.
929 : */
930 GIC 120 : if (has_subdiff)
931 ECB : {
932 : float8 res;
933 :
934 GIC 120 : res = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
935 ECB : typcache->rng_collation,
936 GIC 120 : bound2->val,
937 CBC 120 : bound1->val));
938 ECB : /* Reject possible NaN result, also negative result */
939 GIC 120 : if (isnan(res) || res < 0.0)
940 LBC 0 : return 1.0;
941 EUB : else
942 GIC 120 : return res;
943 ECB : }
944 : else
945 UIC 0 : return 1.0;
946 EUB : }
947 GIC 84 : else if (bound1->infinite && bound2->infinite)
948 ECB : {
949 : /* Both bounds are infinite */
950 UIC 0 : if (bound1->lower == bound2->lower)
951 UBC 0 : return 0.0;
952 EUB : else
953 UIC 0 : return get_float8_infinity();
954 EUB : }
955 : else
956 : {
957 : /* One bound is infinite, the other is not */
958 GIC 84 : return get_float8_infinity();
959 ECB : }
960 : }
961 :
962 : /*
963 : * Calculate the average of function P(x), in the interval [length1, length2],
964 : * where P(x) is the fraction of tuples with length < x (or length <= x if
965 : * 'equal' is true).
966 : */
967 : static double
968 GIC 180 : calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
969 ECB : double length1, double length2, bool equal)
970 : {
971 : double frac;
972 : double A,
973 : B,
974 : PA,
975 : PB;
976 : double pos;
977 : int i;
978 : double area;
979 :
980 GIC 180 : Assert(length2 >= length1);
981 ECB :
982 GIC 180 : if (length2 < 0.0)
983 LBC 0 : return 0.0; /* shouldn't happen, but doesn't hurt to check */
984 EUB :
985 : /* All lengths in the table are <= infinite. */
986 GIC 180 : if (isinf(length2) && equal)
987 LBC 0 : return 1.0;
988 EUB :
989 : /*----------
990 : * The average of a function between A and B can be calculated by the
991 : * formula:
992 : *
993 : * B
994 : * 1 /
995 : * ------- | P(x)dx
996 : * B - A /
997 : * A
998 : *
999 : * The geometrical interpretation of the integral is the area under the
1000 : * graph of P(x). P(x) is defined by the length histogram. We calculate
1001 : * the area in a piecewise fashion, iterating through the length histogram
1002 : * bins. Each bin is a trapezoid:
1003 : *
1004 : * P(x2)
1005 : * /|
1006 : * / |
1007 : * P(x1)/ |
1008 : * | |
1009 : * | |
1010 : * ---+---+--
1011 : * x1 x2
1012 : *
1013 : * where x1 and x2 are the boundaries of the current histogram, and P(x1)
1014 : * and P(x1) are the cumulative fraction of tuples at the boundaries.
1015 : *
1016 : * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
1017 : *
1018 : * The first bin contains the lower bound passed by the caller, so we
1019 : * use linear interpolation between the previous and next histogram bin
1020 : * boundary to calculate P(x1). Likewise for the last bin: we use linear
1021 : * interpolation to calculate P(x2). For the bins in between, x1 and x2
1022 : * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
1023 : * P(x1) = (bin index) / (number of bins)
1024 : * P(x2) = (bin index + 1 / (number of bins)
1025 : */
1026 :
1027 : /* First bin, the one that contains lower bound */
1028 GIC 180 : i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
1029 CBC 180 : if (i >= length_hist_nvalues - 1)
1030 24 : return 1.0;
1031 ECB :
1032 GIC 156 : if (i < 0)
1033 ECB : {
1034 GIC 42 : i = 0;
1035 CBC 42 : pos = 0.0;
1036 ECB : }
1037 : else
1038 : {
1039 : /* interpolate length1's position in the bin */
1040 GIC 114 : pos = get_len_position(length1,
1041 CBC 114 : DatumGetFloat8(length_hist_values[i]),
1042 114 : DatumGetFloat8(length_hist_values[i + 1]));
1043 ECB : }
1044 GIC 156 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
1045 CBC 156 : B = length1;
1046 ECB :
1047 : /*
1048 : * In the degenerate case that length1 == length2, simply return
1049 : * P(length1). This is not merely an optimization: if length1 == length2,
1050 : * we'd divide by zero later on.
1051 : */
1052 GIC 156 : if (length2 == length1)
1053 CBC 72 : return PB;
1054 ECB :
1055 : /*
1056 : * Loop through all the bins, until we hit the last bin, the one that
1057 : * contains the upper bound. (if lower and upper bounds are in the same
1058 : * bin, this falls out immediately)
1059 : */
1060 GIC 84 : area = 0.0;
1061 CBC 1584 : for (; i < length_hist_nvalues - 1; i++)
1062 ECB : {
1063 GIC 1584 : double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
1064 ECB :
1065 : /* check if we've reached the last bin */
1066 GIC 1584 : if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
1067 ECB : break;
1068 :
1069 : /* the upper bound of previous bin is the lower bound of this bin */
1070 GIC 1500 : A = B;
1071 CBC 1500 : PA = PB;
1072 ECB :
1073 GIC 1500 : B = bin_upper;
1074 CBC 1500 : PB = (double) i / (double) (length_hist_nvalues - 1);
1075 ECB :
1076 : /*
1077 : * Add the area of this trapezoid to the total. The point of the
1078 : * if-check is to avoid NaN, in the corner case that PA == PB == 0,
1079 : * and B - A == Inf. The area of a zero-height trapezoid (PA == PB ==
1080 : * 0) is zero, regardless of the width (B - A).
1081 : */
1082 GIC 1500 : if (PA > 0 || PB > 0)
1083 CBC 1476 : area += 0.5 * (PB + PA) * (B - A);
1084 ECB : }
1085 :
1086 : /* Last bin */
1087 GIC 84 : A = B;
1088 CBC 84 : PA = PB;
1089 ECB :
1090 GIC 84 : B = length2; /* last bin ends at the query upper bound */
1091 CBC 84 : if (i >= length_hist_nvalues - 1)
1092 LBC 0 : pos = 0.0;
1093 EUB : else
1094 : {
1095 GIC 84 : if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
1096 CBC 18 : pos = 0.0;
1097 ECB : else
1098 GIC 66 : pos = get_len_position(length2,
1099 CBC 66 : DatumGetFloat8(length_hist_values[i]),
1100 66 : DatumGetFloat8(length_hist_values[i + 1]));
1101 ECB : }
1102 GIC 84 : PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
1103 ECB :
1104 GIC 84 : if (PA > 0 || PB > 0)
1105 CBC 66 : area += 0.5 * (PB + PA) * (B - A);
1106 ECB :
1107 : /*
1108 : * Ok, we have calculated the area, ie. the integral. Divide by width to
1109 : * get the requested average.
1110 : *
1111 : * Avoid NaN arising from infinite / infinite. This happens at least if
1112 : * length2 is infinite. It's not clear what the correct value would be in
1113 : * that case, so 0.5 seems as good as any value.
1114 : */
1115 GIC 84 : if (isinf(area) && isinf(length2))
1116 CBC 24 : frac = 0.5;
1117 ECB : else
1118 GIC 60 : frac = area / (length2 - length1);
1119 ECB :
1120 GIC 84 : return frac;
1121 ECB : }
1122 :
1123 : /*
1124 : * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
1125 : * of multiranges that fall within the constant lower and upper bounds. This uses
1126 : * the histograms of range lower bounds and range lengths, on the assumption
1127 : * that the range lengths are independent of the lower bounds.
1128 : *
1129 : * The caller has already checked that constant lower and upper bounds are
1130 : * finite.
1131 : */
1132 : static double
1133 GIC 12 : calc_hist_selectivity_contained(TypeCacheEntry *typcache,
1134 ECB : const RangeBound *lower, RangeBound *upper,
1135 : const RangeBound *hist_lower, int hist_nvalues,
1136 : Datum *length_hist_values, int length_hist_nvalues)
1137 : {
1138 : int i,
1139 : upper_index;
1140 : float8 prev_dist;
1141 : double bin_width;
1142 : double upper_bin_width;
1143 : double sum_frac;
1144 :
1145 : /*
1146 : * Begin by finding the bin containing the upper bound, in the lower bound
1147 : * histogram. Any range with a lower bound > constant upper bound can't
1148 : * match, ie. there are no matches in bins greater than upper_index.
1149 : */
1150 GIC 12 : upper->inclusive = !upper->inclusive;
1151 CBC 12 : upper->lower = true;
1152 12 : upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
1153 ECB : false);
1154 :
1155 : /*
1156 : * If the upper bound value is below the histogram's lower limit, there
1157 : * are no matches.
1158 : */
1159 GIC 12 : if (upper_index < 0)
1160 LBC 0 : return 0.0;
1161 EUB :
1162 : /*
1163 : * If the upper bound value is at or beyond the histogram's upper limit,
1164 : * start our loop at the last actual bin, as though the upper bound were
1165 : * within that bin; get_position will clamp its result to 1.0 anyway.
1166 : * (This corresponds to assuming that the data population above the
1167 : * histogram's upper limit is empty, exactly like what we just assumed for
1168 : * the lower limit.)
1169 : */
1170 GIC 12 : upper_index = Min(upper_index, hist_nvalues - 2);
1171 ECB :
1172 : /*
1173 : * Calculate upper_bin_width, ie. the fraction of the (upper_index,
1174 : * upper_index + 1) bin which is greater than upper bound of query range
1175 : * using linear interpolation of subdiff function.
1176 : */
1177 GIC 12 : upper_bin_width = get_position(typcache, upper,
1178 CBC 12 : &hist_lower[upper_index],
1179 12 : &hist_lower[upper_index + 1]);
1180 ECB :
1181 : /*
1182 : * In the loop, dist and prev_dist are the distance of the "current" bin's
1183 : * lower and upper bounds from the constant upper bound.
1184 : *
1185 : * bin_width represents the width of the current bin. Normally it is 1.0,
1186 : * meaning a full width bin, but can be less in the corner cases: start
1187 : * and end of the loop. We start with bin_width = upper_bin_width, because
1188 : * we begin at the bin containing the upper bound.
1189 : */
1190 GIC 12 : prev_dist = 0.0;
1191 CBC 12 : bin_width = upper_bin_width;
1192 ECB :
1193 GIC 12 : sum_frac = 0.0;
1194 CBC 60 : for (i = upper_index; i >= 0; i--)
1195 ECB : {
1196 : double dist;
1197 : double length_hist_frac;
1198 GIC 60 : bool final_bin = false;
1199 ECB :
1200 : /*
1201 : * dist -- distance from upper bound of query range to lower bound of
1202 : * the current bin in the lower bound histogram. Or to the lower bound
1203 : * of the constant range, if this is the final bin, containing the
1204 : * constant lower bound.
1205 : */
1206 GIC 60 : if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
1207 ECB : {
1208 GIC 12 : dist = get_distance(typcache, lower, upper);
1209 ECB :
1210 : /*
1211 : * Subtract from bin_width the portion of this bin that we want to
1212 : * ignore.
1213 : */
1214 GIC 24 : bin_width -= get_position(typcache, lower, &hist_lower[i],
1215 CBC 12 : &hist_lower[i + 1]);
1216 12 : if (bin_width < 0.0)
1217 LBC 0 : bin_width = 0.0;
1218 GBC 12 : final_bin = true;
1219 ECB : }
1220 : else
1221 GIC 48 : dist = get_distance(typcache, &hist_lower[i], upper);
1222 ECB :
1223 : /*
1224 : * Estimate the fraction of tuples in this bin that are narrow enough
1225 : * to not exceed the distance to the upper bound of the query range.
1226 : */
1227 GIC 60 : length_hist_frac = calc_length_hist_frac(length_hist_values,
1228 ECB : length_hist_nvalues,
1229 : prev_dist, dist, true);
1230 :
1231 : /*
1232 : * Add the fraction of tuples in this bin, with a suitable length, to
1233 : * the total.
1234 : */
1235 GIC 60 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1236 ECB :
1237 GIC 60 : if (final_bin)
1238 CBC 12 : break;
1239 ECB :
1240 GIC 48 : bin_width = 1.0;
1241 CBC 48 : prev_dist = dist;
1242 ECB : }
1243 :
1244 GIC 12 : return sum_frac;
1245 ECB : }
1246 :
1247 : /*
1248 : * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
1249 : * of multiranges that contain the constant lower and upper bounds. This uses
1250 : * the histograms of range lower bounds and range lengths, on the assumption
1251 : * that the range lengths are independent of the lower bounds.
1252 : */
1253 : static double
1254 GIC 24 : calc_hist_selectivity_contains(TypeCacheEntry *typcache,
1255 ECB : const RangeBound *lower, const RangeBound *upper,
1256 : const RangeBound *hist_lower, int hist_nvalues,
1257 : Datum *length_hist_values, int length_hist_nvalues)
1258 : {
1259 : int i,
1260 : lower_index;
1261 : double bin_width,
1262 : lower_bin_width;
1263 : double sum_frac;
1264 : float8 prev_dist;
1265 :
1266 : /* Find the bin containing the lower bound of query range. */
1267 GIC 24 : lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
1268 ECB : true);
1269 :
1270 : /*
1271 : * If the lower bound value is below the histogram's lower limit, there
1272 : * are no matches.
1273 : */
1274 GIC 24 : if (lower_index < 0)
1275 LBC 0 : return 0.0;
1276 EUB :
1277 : /*
1278 : * If the lower bound value is at or beyond the histogram's upper limit,
1279 : * start our loop at the last actual bin, as though the upper bound were
1280 : * within that bin; get_position will clamp its result to 1.0 anyway.
1281 : * (This corresponds to assuming that the data population above the
1282 : * histogram's upper limit is empty, exactly like what we just assumed for
1283 : * the lower limit.)
1284 : */
1285 GIC 24 : lower_index = Min(lower_index, hist_nvalues - 2);
1286 ECB :
1287 : /*
1288 : * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
1289 : * lower_index + 1) bin which is greater than lower bound of query range
1290 : * using linear interpolation of subdiff function.
1291 : */
1292 GIC 24 : lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
1293 CBC 24 : &hist_lower[lower_index + 1]);
1294 ECB :
1295 : /*
1296 : * Loop through all the lower bound bins, smaller than the query lower
1297 : * bound. In the loop, dist and prev_dist are the distance of the
1298 : * "current" bin's lower and upper bounds from the constant upper bound.
1299 : * We begin from query lower bound, and walk backwards, so the first bin's
1300 : * upper bound is the query lower bound, and its distance to the query
1301 : * upper bound is the length of the query range.
1302 : *
1303 : * bin_width represents the width of the current bin. Normally it is 1.0,
1304 : * meaning a full width bin, except for the first bin, which is only
1305 : * counted up to the constant lower bound.
1306 : */
1307 GIC 24 : prev_dist = get_distance(typcache, lower, upper);
1308 CBC 24 : sum_frac = 0.0;
1309 24 : bin_width = lower_bin_width;
1310 144 : for (i = lower_index; i >= 0; i--)
1311 ECB : {
1312 : float8 dist;
1313 : double length_hist_frac;
1314 :
1315 : /*
1316 : * dist -- distance from upper bound of query range to current value
1317 : * of lower bound histogram or lower bound of query range (if we've
1318 : * reach it).
1319 : */
1320 GIC 120 : dist = get_distance(typcache, &hist_lower[i], upper);
1321 ECB :
1322 : /*
1323 : * Get average fraction of length histogram which covers intervals
1324 : * longer than (or equal to) distance to upper bound of query range.
1325 : */
1326 GIC 120 : length_hist_frac =
1327 CBC 120 : 1.0 - calc_length_hist_frac(length_hist_values,
1328 ECB : length_hist_nvalues,
1329 : prev_dist, dist, false);
1330 :
1331 GIC 120 : sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
1332 ECB :
1333 GIC 120 : bin_width = 1.0;
1334 CBC 120 : prev_dist = dist;
1335 ECB : }
1336 :
1337 GIC 24 : return sum_frac;
1338 ECB : }
|