Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * mcv.c
4 : : * POSTGRES multivariate MCV lists
5 : : *
6 : : *
7 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/statistics/mcv.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : #include "postgres.h"
16 : :
17 : : #include <math.h>
18 : :
19 : : #include "access/htup_details.h"
20 : : #include "catalog/pg_statistic_ext.h"
21 : : #include "catalog/pg_statistic_ext_data.h"
22 : : #include "fmgr.h"
23 : : #include "funcapi.h"
24 : : #include "nodes/nodeFuncs.h"
25 : : #include "statistics/extended_stats_internal.h"
26 : : #include "statistics/statistics.h"
27 : : #include "utils/array.h"
28 : : #include "utils/builtins.h"
29 : : #include "utils/fmgrprotos.h"
30 : : #include "utils/lsyscache.h"
31 : : #include "utils/selfuncs.h"
32 : : #include "utils/syscache.h"
33 : : #include "utils/typcache.h"
34 : :
35 : : /*
36 : : * Computes size of a serialized MCV item, depending on the number of
37 : : * dimensions (columns) the statistic is defined on. The datum values are
38 : : * stored in a separate array (deduplicated, to minimize the size), and
39 : : * so the serialized items only store uint16 indexes into that array.
40 : : *
41 : : * Each serialized item stores (in this order):
42 : : *
43 : : * - indexes to values (ndim * sizeof(uint16))
44 : : * - null flags (ndim * sizeof(bool))
45 : : * - frequency (sizeof(double))
46 : : * - base_frequency (sizeof(double))
47 : : *
48 : : * There is no alignment padding within an MCV item.
49 : : * So in total each MCV item requires this many bytes:
50 : : *
51 : : * ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)
52 : : */
53 : : #define ITEM_SIZE(ndims) \
54 : : ((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double))
55 : :
56 : : /*
57 : : * Used to compute size of serialized MCV list representation.
58 : : */
59 : : #define MinSizeOfMCVList \
60 : : (VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber))
61 : :
62 : : /*
63 : : * Size of the serialized MCV list, excluding the space needed for
64 : : * deduplicated per-dimension values. The macro is meant to be used
65 : : * when it's not yet safe to access the serialized info about amount
66 : : * of data for each column.
67 : : */
68 : : #define SizeOfMCVList(ndims,nitems) \
69 : : ((MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \
70 : : ((ndims) * sizeof(DimensionInfo)) + \
71 : : ((nitems) * ITEM_SIZE(ndims)))
72 : :
73 : : static MultiSortSupport build_mss(StatsBuildData *data);
74 : :
75 : : static SortItem *build_distinct_groups(int numrows, SortItem *items,
76 : : MultiSortSupport mss, int *ndistinct);
77 : :
78 : : static SortItem **build_column_frequencies(SortItem *groups, int ngroups,
79 : : MultiSortSupport mss, int *ncounts);
80 : :
81 : : static int count_distinct_groups(int numrows, SortItem *items,
82 : : MultiSortSupport mss);
83 : :
84 : : /*
85 : : * Compute new value for bitmap item, considering whether it's used for
86 : : * clauses connected by AND/OR.
87 : : */
88 : : #define RESULT_MERGE(value, is_or, match) \
89 : : ((is_or) ? ((value) || (match)) : ((value) && (match)))
90 : :
91 : : /*
92 : : * When processing a list of clauses, the bitmap item may get set to a value
93 : : * such that additional clauses can't change it. For example, when processing
94 : : * a list of clauses connected to AND, as soon as the item gets set to 'false'
95 : : * then it'll remain like that. Similarly clauses connected by OR and 'true'.
96 : : *
97 : : * Returns true when the value in the bitmap can't change no matter how the
98 : : * remaining clauses are evaluated.
99 : : */
100 : : #define RESULT_IS_FINAL(value, is_or) ((is_or) ? (value) : (!(value)))
101 : :
102 : : /*
103 : : * get_mincount_for_mcv_list
104 : : * Determine the minimum number of times a value needs to appear in
105 : : * the sample for it to be included in the MCV list.
106 : : *
107 : : * We want to keep only values that appear sufficiently often in the
108 : : * sample that it is reasonable to extrapolate their sample frequencies to
109 : : * the entire table. We do this by placing an upper bound on the relative
110 : : * standard error of the sample frequency, so that any estimates the
111 : : * planner generates from the MCV statistics can be expected to be
112 : : * reasonably accurate.
113 : : *
114 : : * Since we are sampling without replacement, the sample frequency of a
115 : : * particular value is described by a hypergeometric distribution. A
116 : : * common rule of thumb when estimating errors in this situation is to
117 : : * require at least 10 instances of the value in the sample, in which case
118 : : * the distribution can be approximated by a normal distribution, and
119 : : * standard error analysis techniques can be applied. Given a sample size
120 : : * of n, a population size of N, and a sample frequency of p=cnt/n, the
121 : : * standard error of the proportion p is given by
122 : : * SE = sqrt(p*(1-p)/n) * sqrt((N-n)/(N-1))
123 : : * where the second term is the finite population correction. To get
124 : : * reasonably accurate planner estimates, we impose an upper bound on the
125 : : * relative standard error of 20% -- i.e., SE/p < 0.2. This 20% relative
126 : : * error bound is fairly arbitrary, but has been found empirically to work
127 : : * well. Rearranging this formula gives a lower bound on the number of
128 : : * instances of the value seen:
129 : : * cnt > n*(N-n) / (N-n+0.04*n*(N-1))
130 : : * This bound is at most 25, and approaches 0 as n approaches 0 or N. The
131 : : * case where n approaches 0 cannot happen in practice, since the sample
132 : : * size is at least 300. The case where n approaches N corresponds to
133 : : * sampling the whole table, in which case it is reasonable to keep
134 : : * the whole MCV list (have no lower bound), so it makes sense to apply
135 : : * this formula for all inputs, even though the above derivation is
136 : : * technically only valid when the right hand side is at least around 10.
137 : : *
138 : : * An alternative way to look at this formula is as follows -- assume that
139 : : * the number of instances of the value seen scales up to the entire
140 : : * table, so that the population count is K=N*cnt/n. Then the distribution
141 : : * in the sample is a hypergeometric distribution parameterised by N, n
142 : : * and K, and the bound above is mathematically equivalent to demanding
143 : : * that the standard deviation of that distribution is less than 20% of
144 : : * its mean. Thus the relative errors in any planner estimates produced
145 : : * from the MCV statistics are likely to be not too large.
146 : : */
147 : : static double
1845 tomas.vondra@postgre 148 :CBC 90 : get_mincount_for_mcv_list(int samplerows, double totalrows)
149 : : {
150 : 90 : double n = samplerows;
151 : 90 : double N = totalrows;
152 : : double numer,
153 : : denom;
154 : :
155 : 90 : numer = n * (N - n);
156 : 90 : denom = N - n + 0.04 * n * (N - 1);
157 : :
158 : : /* Guard against division by zero (possible if n = N = 1) */
159 [ + + ]: 90 : if (denom == 0.0)
160 : 6 : return 0.0;
161 : :
162 : 84 : return numer / denom;
163 : : }
164 : :
165 : : /*
166 : : * Builds MCV list from the set of sampled rows.
167 : : *
168 : : * The algorithm is quite simple:
169 : : *
170 : : * (1) sort the data (default collation, '<' for the data type)
171 : : *
172 : : * (2) count distinct groups, decide how many to keep
173 : : *
174 : : * (3) build the MCV list using the threshold determined in (2)
175 : : *
176 : : * (4) remove rows represented by the MCV from the sample
177 : : *
178 : : */
179 : : MCVList *
1115 180 : 90 : statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget)
181 : : {
182 : : int i,
183 : : numattrs,
184 : : numrows,
185 : : ngroups,
186 : : nitems;
187 : : double mincount;
188 : : SortItem *items;
189 : : SortItem *groups;
1845 190 : 90 : MCVList *mcvlist = NULL;
191 : : MultiSortSupport mss;
192 : :
193 : : /* comparator for all the columns */
1115 194 : 90 : mss = build_mss(data);
195 : :
196 : : /* sort the rows */
197 : 90 : items = build_sorted_items(data, &nitems, mss,
198 : : data->nattnums, data->attnums);
199 : :
1845 200 [ - + ]: 90 : if (!items)
1845 tomas.vondra@postgre 201 :UBC 0 : return NULL;
202 : :
203 : : /* for convenience */
1115 tomas.vondra@postgre 204 :CBC 90 : numattrs = data->nattnums;
205 : 90 : numrows = data->numrows;
206 : :
207 : : /* transform the sorted rows into groups (sorted by frequency) */
1845 208 : 90 : groups = build_distinct_groups(nitems, items, mss, &ngroups);
209 : :
210 : : /*
211 : : * The maximum number of MCV items to store, based on the statistics
212 : : * target we computed for the statistics object (from the target set for
213 : : * the object itself, attributes and the system default). In any case, we
214 : : * can't keep more groups than we have available.
215 : : */
1678 216 : 90 : nitems = stattarget;
1845 217 [ + + ]: 90 : if (nitems > ngroups)
218 : 54 : nitems = ngroups;
219 : :
220 : : /*
221 : : * Decide how many items to keep in the MCV list. We can't use the same
222 : : * algorithm as per-column MCV lists, because that only considers the
223 : : * actual group frequency - but we're primarily interested in how the
224 : : * actual frequency differs from the base frequency (product of simple
225 : : * per-column frequencies, as if the columns were independent).
226 : : *
227 : : * Using the same algorithm might exclude items that are close to the
228 : : * "average" frequency of the sample. But that does not say whether the
229 : : * observed frequency is close to the base frequency or not. We also need
230 : : * to consider unexpectedly uncommon items (again, compared to the base
231 : : * frequency), and the single-column algorithm does not have to.
232 : : *
233 : : * We simply decide how many items to keep by computing the minimum count
234 : : * using get_mincount_for_mcv_list() and then keep all items that seem to
235 : : * be more common than that.
236 : : */
237 : 90 : mincount = get_mincount_for_mcv_list(numrows, totalrows);
238 : :
239 : : /*
240 : : * Walk the groups until we find the first group with a count below the
241 : : * mincount threshold (the index of that group is the number of groups we
242 : : * want to keep).
243 : : */
244 [ + + ]: 4386 : for (i = 0; i < nitems; i++)
245 : : {
246 [ - + ]: 4296 : if (groups[i].count < mincount)
247 : : {
1845 tomas.vondra@postgre 248 :UBC 0 : nitems = i;
249 : 0 : break;
250 : : }
251 : : }
252 : :
253 : : /*
254 : : * At this point, we know the number of items for the MCV list. There
255 : : * might be none (for uniform distribution with many groups), and in that
256 : : * case, there will be no MCV list. Otherwise, construct the MCV list.
257 : : */
1845 tomas.vondra@postgre 258 [ + - ]:CBC 90 : if (nitems > 0)
259 : : {
260 : : int j;
261 : : SortItem key;
262 : : MultiSortSupport tmp;
263 : :
264 : : /* frequencies for values in each attribute */
265 : : SortItem **freqs;
266 : : int *nfreqs;
267 : :
268 : : /* used to search values */
1746 269 : 90 : tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
270 : : + sizeof(SortSupportData));
271 : :
272 : : /* compute frequencies for values in each column */
273 : 90 : nfreqs = (int *) palloc0(sizeof(int) * numattrs);
274 : 90 : freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
275 : :
276 : : /*
277 : : * Allocate the MCV list structure, set the global parameters.
278 : : */
1844 279 : 90 : mcvlist = (MCVList *) palloc0(offsetof(MCVList, items) +
280 : 90 : sizeof(MCVItem) * nitems);
281 : :
1845 282 : 90 : mcvlist->magic = STATS_MCV_MAGIC;
283 : 90 : mcvlist->type = STATS_MCV_TYPE_BASIC;
284 : 90 : mcvlist->ndimensions = numattrs;
285 : 90 : mcvlist->nitems = nitems;
286 : :
287 : : /* store info about data type OIDs */
288 [ + + ]: 345 : for (i = 0; i < numattrs; i++)
1115 289 : 255 : mcvlist->types[i] = data->stats[i]->attrtypid;
290 : :
291 : : /* Copy the first chunk of groups into the result. */
1845 292 [ + + ]: 4386 : for (i = 0; i < nitems; i++)
293 : : {
294 : : /* just point to the proper place in the list */
1844 295 : 4296 : MCVItem *item = &mcvlist->items[i];
296 : :
297 : 4296 : item->values = (Datum *) palloc(sizeof(Datum) * numattrs);
298 : 4296 : item->isnull = (bool *) palloc(sizeof(bool) * numattrs);
299 : :
300 : : /* copy values for the group */
1845 301 : 4296 : memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs);
302 : 4296 : memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs);
303 : :
304 : : /* groups should be sorted by frequency in descending order */
305 [ + + - + ]: 4296 : Assert((i == 0) || (groups[i - 1].count >= groups[i].count));
306 : :
307 : : /* group frequency */
308 : 4296 : item->frequency = (double) groups[i].count / numrows;
309 : :
310 : : /* base frequency, if the attributes were independent */
311 : 4296 : item->base_frequency = 1.0;
312 [ + + ]: 16425 : for (j = 0; j < numattrs; j++)
313 : : {
314 : : SortItem *freq;
315 : :
316 : : /* single dimension */
1746 317 : 12129 : tmp->ndims = 1;
318 : 12129 : tmp->ssup[0] = mss->ssup[j];
319 : :
320 : : /* fill search key */
321 : 12129 : key.values = &groups[i].values[j];
322 : 12129 : key.isnull = &groups[i].isnull[j];
323 : :
324 : 12129 : freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
325 : : sizeof(SortItem),
326 : : multi_sort_compare, tmp);
327 : :
328 : 12129 : item->base_frequency *= ((double) freq->count) / numrows;
329 : : }
330 : : }
331 : :
332 : 90 : pfree(nfreqs);
333 : 90 : pfree(freqs);
334 : : }
335 : :
1845 336 : 90 : pfree(items);
337 : 90 : pfree(groups);
338 : :
339 : 90 : return mcvlist;
340 : : }
341 : :
342 : : /*
343 : : * build_mss
344 : : * Build a MultiSortSupport for the given StatsBuildData.
345 : : */
346 : : static MultiSortSupport
1115 347 : 90 : build_mss(StatsBuildData *data)
348 : : {
349 : : int i;
350 : 90 : int numattrs = data->nattnums;
351 : :
352 : : /* Sort by multiple columns (using array of SortSupport) */
1845 353 : 90 : MultiSortSupport mss = multi_sort_init(numattrs);
354 : :
355 : : /* prepare the sort functions for all the attributes */
356 [ + + ]: 345 : for (i = 0; i < numattrs; i++)
357 : : {
1115 358 : 255 : VacAttrStats *colstat = data->stats[i];
359 : : TypeCacheEntry *type;
360 : :
1845 361 : 255 : type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
362 [ - + ]: 255 : if (type->lt_opr == InvalidOid) /* shouldn't happen */
1845 tomas.vondra@postgre 363 [ # # ]:UBC 0 : elog(ERROR, "cache lookup failed for ordering operator for type %u",
364 : : colstat->attrtypid);
365 : :
1732 tomas.vondra@postgre 366 :CBC 255 : multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
367 : : }
368 : :
1845 369 : 90 : return mss;
370 : : }
371 : :
372 : : /*
373 : : * count_distinct_groups
374 : : * Count distinct combinations of SortItems in the array.
375 : : *
376 : : * The array is assumed to be sorted according to the MultiSortSupport.
377 : : */
378 : : static int
379 : 90 : count_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss)
380 : : {
381 : : int i;
382 : : int ndistinct;
383 : :
384 : 90 : ndistinct = 1;
385 [ + + ]: 229509 : for (i = 1; i < numrows; i++)
386 : : {
387 : : /* make sure the array really is sorted */
388 [ - + ]: 229419 : Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
389 : :
390 [ + + ]: 229419 : if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
391 : 38856 : ndistinct += 1;
392 : : }
393 : :
394 : 90 : return ndistinct;
395 : : }
396 : :
397 : : /*
398 : : * compare_sort_item_count
399 : : * Comparator for sorting items by count (frequencies) in descending
400 : : * order.
401 : : */
402 : : static int
642 tgl@sss.pgh.pa.us 403 : 39378 : compare_sort_item_count(const void *a, const void *b, void *arg)
404 : : {
1845 tomas.vondra@postgre 405 : 39378 : SortItem *ia = (SortItem *) a;
406 : 39378 : SortItem *ib = (SortItem *) b;
407 : :
408 [ + + ]: 39378 : if (ia->count == ib->count)
409 : 38904 : return 0;
410 [ + + ]: 474 : else if (ia->count > ib->count)
411 : 315 : return -1;
412 : :
413 : 159 : return 1;
414 : : }
415 : :
416 : : /*
417 : : * build_distinct_groups
418 : : * Build an array of SortItems for distinct groups and counts matching
419 : : * items.
420 : : *
421 : : * The 'items' array is assumed to be sorted.
422 : : */
423 : : static SortItem *
424 : 90 : build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
425 : : int *ndistinct)
426 : : {
427 : : int i,
428 : : j;
429 : 90 : int ngroups = count_distinct_groups(numrows, items, mss);
430 : :
431 : 90 : SortItem *groups = (SortItem *) palloc(ngroups * sizeof(SortItem));
432 : :
433 : 90 : j = 0;
434 : 90 : groups[0] = items[0];
435 : 90 : groups[0].count = 1;
436 : :
437 [ + + ]: 229509 : for (i = 1; i < numrows; i++)
438 : : {
439 : : /* Assume sorted in ascending order. */
440 [ - + ]: 229419 : Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
441 : :
442 : : /* New distinct group detected. */
443 [ + + ]: 229419 : if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
444 : : {
445 : 38856 : groups[++j] = items[i];
446 : 38856 : groups[j].count = 0;
447 : : }
448 : :
449 : 229419 : groups[j].count++;
450 : : }
451 : :
452 : : /* ensure we filled the expected number of distinct groups */
453 [ - + ]: 90 : Assert(j + 1 == ngroups);
454 : :
455 : : /* Sort the distinct groups by frequency (in descending order). */
432 peter@eisentraut.org 456 : 90 : qsort_interruptible(groups, ngroups, sizeof(SortItem),
457 : : compare_sort_item_count, NULL);
458 : :
1845 tomas.vondra@postgre 459 : 90 : *ndistinct = ngroups;
460 : 90 : return groups;
461 : : }
462 : :
463 : : /* compare sort items (single dimension) */
464 : : static int
1746 465 : 417045 : sort_item_compare(const void *a, const void *b, void *arg)
466 : : {
1431 tgl@sss.pgh.pa.us 467 : 417045 : SortSupport ssup = (SortSupport) arg;
1746 tomas.vondra@postgre 468 : 417045 : SortItem *ia = (SortItem *) a;
469 : 417045 : SortItem *ib = (SortItem *) b;
470 : :
471 : 834090 : return ApplySortComparator(ia->values[0], ia->isnull[0],
472 : 417045 : ib->values[0], ib->isnull[0],
473 : : ssup);
474 : : }
475 : :
476 : : /*
477 : : * build_column_frequencies
478 : : * Compute frequencies of values in each column.
479 : : *
480 : : * This returns an array of SortItems for each attribute the MCV is built
481 : : * on, with a frequency (number of occurrences) for each value. This is
482 : : * then used to compute "base" frequency of MCV items.
483 : : *
484 : : * All the memory is allocated in a single chunk, so that a single pfree
485 : : * is enough to release it. We do not allocate space for values/isnull
486 : : * arrays in the SortItems, because we can simply point into the input
487 : : * groups directly.
488 : : */
489 : : static SortItem **
490 : 90 : build_column_frequencies(SortItem *groups, int ngroups,
491 : : MultiSortSupport mss, int *ncounts)
492 : : {
493 : : int i,
494 : : dim;
495 : : SortItem **result;
496 : : char *ptr;
497 : :
498 [ - + ]: 90 : Assert(groups);
499 [ - + ]: 90 : Assert(ncounts);
500 : :
501 : : /* allocate arrays for all columns as a single chunk */
502 : 90 : ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) +
1431 tgl@sss.pgh.pa.us 503 : 90 : mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
504 : :
505 : : /* initial array of pointers */
1746 tomas.vondra@postgre 506 : 90 : result = (SortItem **) ptr;
507 : 90 : ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims);
508 : :
509 [ + + ]: 345 : for (dim = 0; dim < mss->ndims; dim++)
510 : : {
1431 tgl@sss.pgh.pa.us 511 : 255 : SortSupport ssup = &mss->ssup[dim];
512 : :
513 : : /* array of values for a single column */
1746 tomas.vondra@postgre 514 : 255 : result[dim] = (SortItem *) ptr;
515 : 255 : ptr += MAXALIGN(sizeof(SortItem) * ngroups);
516 : :
517 : : /* extract data for the dimension */
518 [ + + ]: 113484 : for (i = 0; i < ngroups; i++)
519 : : {
520 : : /* point into the input groups */
521 : 113229 : result[dim][i].values = &groups[i].values[dim];
522 : 113229 : result[dim][i].isnull = &groups[i].isnull[dim];
523 : 113229 : result[dim][i].count = groups[i].count;
524 : : }
525 : :
526 : : /* sort the values, deduplicate */
432 peter@eisentraut.org 527 : 255 : qsort_interruptible(result[dim], ngroups, sizeof(SortItem),
528 : : sort_item_compare, ssup);
529 : :
530 : : /*
531 : : * Identify distinct values, compute frequency (there might be
532 : : * multiple MCV items containing this value, so we need to sum counts
533 : : * from all of them.
534 : : */
1746 tomas.vondra@postgre 535 : 255 : ncounts[dim] = 1;
536 [ + + ]: 113229 : for (i = 1; i < ngroups; i++)
537 : : {
1431 tgl@sss.pgh.pa.us 538 [ + + ]: 112974 : if (sort_item_compare(&result[dim][i - 1], &result[dim][i], ssup) == 0)
539 : : {
540 : 68091 : result[dim][ncounts[dim] - 1].count += result[dim][i].count;
1746 tomas.vondra@postgre 541 : 68091 : continue;
542 : : }
543 : :
544 : 44883 : result[dim][ncounts[dim]] = result[dim][i];
545 : :
546 : 44883 : ncounts[dim]++;
547 : : }
548 : : }
549 : :
550 : 90 : return result;
551 : : }
552 : :
553 : : /*
554 : : * statext_mcv_load
555 : : * Load the MCV list for the indicated pg_statistic_ext_data tuple.
556 : : */
557 : : MCVList *
819 558 : 240 : statext_mcv_load(Oid mvoid, bool inh)
559 : : {
560 : : MCVList *result;
561 : : bool isnull;
562 : : Datum mcvlist;
563 : 240 : HeapTuple htup = SearchSysCache2(STATEXTDATASTXOID,
564 : : ObjectIdGetDatum(mvoid), BoolGetDatum(inh));
565 : :
1845 566 [ - + ]: 240 : if (!HeapTupleIsValid(htup))
1845 tomas.vondra@postgre 567 [ # # ]:UBC 0 : elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
568 : :
1767 tomas.vondra@postgre 569 :CBC 240 : mcvlist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
570 : : Anum_pg_statistic_ext_data_stxdmcv, &isnull);
571 : :
1845 572 [ - + ]: 240 : if (isnull)
1844 tomas.vondra@postgre 573 [ # # ]:UBC 0 : elog(ERROR,
574 : : "requested statistics kind \"%c\" is not yet built for statistics object %u",
575 : : STATS_EXT_MCV, mvoid);
576 : :
1844 tomas.vondra@postgre 577 :CBC 240 : result = statext_mcv_deserialize(DatumGetByteaP(mcvlist));
578 : :
579 : 240 : ReleaseSysCache(htup);
580 : :
581 : 240 : return result;
582 : : }
583 : :
584 : :
585 : : /*
586 : : * statext_mcv_serialize
587 : : * Serialize MCV list into a pg_mcv_list value.
588 : : *
589 : : * The MCV items may include values of various data types, and it's reasonable
590 : : * to expect redundancy (values for a given attribute, repeated for multiple
591 : : * MCV list items). So we deduplicate the values into arrays, and then replace
592 : : * the values by indexes into those arrays.
593 : : *
594 : : * The overall structure of the serialized representation looks like this:
595 : : *
596 : : * +---------------+----------------+---------------------+-------+
597 : : * | header fields | dimension info | deduplicated values | items |
598 : : * +---------------+----------------+---------------------+-------+
599 : : *
600 : : * Where dimension info stores information about the type of the K-th
601 : : * attribute (e.g. typlen, typbyval and length of deduplicated values).
602 : : * Deduplicated values store deduplicated values for each attribute. And
603 : : * items store the actual MCV list items, with values replaced by indexes into
604 : : * the arrays.
605 : : *
606 : : * When serializing the items, we use uint16 indexes. The number of MCV items
607 : : * is limited by the statistics target (which is capped to 10k at the moment).
608 : : * We might increase this to 65k and still fit into uint16, so there's a bit of
609 : : * slack. Furthermore, this limit is on the number of distinct values per column,
610 : : * and we usually have few of those (and various combinations of them for the
611 : : * those MCV list). So uint16 seems fine for now.
612 : : *
613 : : * We don't really expect the serialization to save as much space as for
614 : : * histograms, as we are not doing any bucket splits (which is the source
615 : : * of high redundancy in histograms).
616 : : *
617 : : * TODO: Consider packing boolean flags (NULL) for each item into a single char
618 : : * (or a longer type) instead of using an array of bool items.
619 : : */
620 : : bytea *
1789 tgl@sss.pgh.pa.us 621 : 90 : statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
622 : : {
623 : : int i;
624 : : int dim;
1845 tomas.vondra@postgre 625 : 90 : int ndims = mcvlist->ndimensions;
626 : :
627 : : SortSupport ssup;
628 : : DimensionInfo *info;
629 : :
630 : : Size total_length;
631 : :
632 : : /* serialized items (indexes into arrays, etc.) */
633 : : bytea *raw;
634 : : char *ptr;
635 : : char *endptr PG_USED_FOR_ASSERTS_ONLY;
636 : :
637 : : /* values per dimension (and number of non-NULL values) */
638 : 90 : Datum **values = (Datum **) palloc0(sizeof(Datum *) * ndims);
639 : 90 : int *counts = (int *) palloc0(sizeof(int) * ndims);
640 : :
641 : : /*
642 : : * We'll include some rudimentary information about the attribute types
643 : : * (length, by-val flag), so that we don't have to look them up while
644 : : * deserializing the MCV list (we already have the type OID in the
645 : : * header). This is safe because when changing the type of the attribute
646 : : * the statistics gets dropped automatically. We need to store the info
647 : : * about the arrays of deduplicated values anyway.
648 : : */
649 : 90 : info = (DimensionInfo *) palloc0(sizeof(DimensionInfo) * ndims);
650 : :
651 : : /* sort support data for all attributes included in the MCV list */
652 : 90 : ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims);
653 : :
654 : : /* collect and deduplicate values for each dimension (attribute) */
655 [ + + ]: 345 : for (dim = 0; dim < ndims; dim++)
656 : : {
657 : : int ndistinct;
658 : : TypeCacheEntry *typentry;
659 : :
660 : : /*
661 : : * Lookup the LT operator (can't get it from stats extra_data, as we
662 : : * don't know how to interpret that - scalar vs. array etc.).
663 : : */
664 : 255 : typentry = lookup_type_cache(stats[dim]->attrtypid, TYPECACHE_LT_OPR);
665 : :
666 : : /* copy important info about the data type (length, by-value) */
667 : 255 : info[dim].typlen = stats[dim]->attrtype->typlen;
668 : 255 : info[dim].typbyval = stats[dim]->attrtype->typbyval;
669 : :
670 : : /* allocate space for values in the attribute and collect them */
671 : 255 : values[dim] = (Datum *) palloc0(sizeof(Datum) * mcvlist->nitems);
672 : :
673 [ + + ]: 12384 : for (i = 0; i < mcvlist->nitems; i++)
674 : : {
675 : : /* skip NULL values - we don't need to deduplicate those */
1844 676 [ + + ]: 12129 : if (mcvlist->items[i].isnull[dim])
1845 677 : 33 : continue;
678 : :
679 : : /* append the value at the end */
1844 680 : 12096 : values[dim][counts[dim]] = mcvlist->items[i].values[dim];
1845 681 : 12096 : counts[dim] += 1;
682 : : }
683 : :
684 : : /* if there are just NULL values in this dimension, we're done */
685 [ + + ]: 255 : if (counts[dim] == 0)
686 : 3 : continue;
687 : :
688 : : /* sort and deduplicate the data */
689 : 252 : ssup[dim].ssup_cxt = CurrentMemoryContext;
1732 690 : 252 : ssup[dim].ssup_collation = stats[dim]->attrcollid;
1845 691 : 252 : ssup[dim].ssup_nulls_first = false;
692 : :
693 : 252 : PrepareSortSupportFromOrderingOp(typentry->lt_opr, &ssup[dim]);
694 : :
642 tgl@sss.pgh.pa.us 695 : 252 : qsort_interruptible(values[dim], counts[dim], sizeof(Datum),
696 : 252 : compare_scalars_simple, &ssup[dim]);
697 : :
698 : : /*
699 : : * Walk through the array and eliminate duplicate values, but keep the
700 : : * ordering (so that we can do a binary search later). We know there's
701 : : * at least one item as (counts[dim] != 0), so we can skip the first
702 : : * element.
703 : : */
1845 tomas.vondra@postgre 704 : 252 : ndistinct = 1; /* number of distinct values */
705 [ + + ]: 12096 : for (i = 1; i < counts[dim]; i++)
706 : : {
707 : : /* expect sorted array */
708 [ - + ]: 11844 : Assert(compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]) <= 0);
709 : :
710 : : /* if the value is the same as the previous one, we can skip it */
711 [ + + ]: 11844 : if (!compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]))
712 : 4977 : continue;
713 : :
714 : 6867 : values[dim][ndistinct] = values[dim][i];
715 : 6867 : ndistinct += 1;
716 : : }
717 : :
718 : : /* we must not exceed PG_UINT16_MAX, as we use uint16 indexes */
719 [ - + ]: 252 : Assert(ndistinct <= PG_UINT16_MAX);
720 : :
721 : : /*
722 : : * Store additional info about the attribute - number of deduplicated
723 : : * values, and also size of the serialized data. For fixed-length data
724 : : * types this is trivial to compute, for varwidth types we need to
725 : : * actually walk the array and sum the sizes.
726 : : */
727 : 252 : info[dim].nvalues = ndistinct;
728 : :
1431 tgl@sss.pgh.pa.us 729 [ + + ]: 252 : if (info[dim].typbyval) /* by-value data types */
730 : : {
1745 tomas.vondra@postgre 731 : 186 : info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
732 : :
733 : : /*
734 : : * We copy the data into the MCV item during deserialization, so
735 : : * we don't need to allocate any extra space.
736 : : */
737 : 186 : info[dim].nbytes_aligned = 0;
738 : : }
1431 tgl@sss.pgh.pa.us 739 [ + + ]: 66 : else if (info[dim].typlen > 0) /* fixed-length by-ref */
740 : : {
741 : : /*
742 : : * We don't care about alignment in the serialized data, so we
743 : : * pack the data as much as possible. But we also track how much
744 : : * data will be needed after deserialization, and in that case we
745 : : * need to account for alignment of each item.
746 : : *
747 : : * Note: As the items are fixed-length, we could easily compute
748 : : * this during deserialization, but we do it here anyway.
749 : : */
1845 tomas.vondra@postgre 750 : 12 : info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
1745 751 : 12 : info[dim].nbytes_aligned = info[dim].nvalues * MAXALIGN(info[dim].typlen);
752 : : }
1845 753 [ + - ]: 54 : else if (info[dim].typlen == -1) /* varlena */
754 : : {
755 : 54 : info[dim].nbytes = 0;
1745 756 : 54 : info[dim].nbytes_aligned = 0;
1845 757 [ + + ]: 1437 : for (i = 0; i < info[dim].nvalues; i++)
758 : : {
759 : : Size len;
760 : :
761 : : /*
762 : : * For varlena values, we detoast the values and store the
763 : : * length and data separately. We don't bother with alignment
764 : : * here, which means that during deserialization we need to
765 : : * copy the fields and only access the copies.
766 : : */
767 : 1383 : values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i]));
768 : :
769 : : /* serialized length (uint32 length + data) */
1745 770 [ - + - - : 1383 : len = VARSIZE_ANY_EXHDR(values[dim][i]);
- - - - -
+ ]
1431 tgl@sss.pgh.pa.us 771 : 1383 : info[dim].nbytes += sizeof(uint32); /* length */
772 : 1383 : info[dim].nbytes += len; /* value (no header) */
773 : :
774 : : /*
775 : : * During deserialization we'll build regular varlena values
776 : : * with full headers, and we need to align them properly.
777 : : */
1745 tomas.vondra@postgre 778 : 1383 : info[dim].nbytes_aligned += MAXALIGN(VARHDRSZ + len);
779 : : }
780 : : }
1845 tomas.vondra@postgre 781 [ # # ]:UBC 0 : else if (info[dim].typlen == -2) /* cstring */
782 : : {
783 : 0 : info[dim].nbytes = 0;
1745 784 : 0 : info[dim].nbytes_aligned = 0;
1845 785 [ # # ]: 0 : for (i = 0; i < info[dim].nvalues; i++)
786 : : {
787 : : Size len;
788 : :
789 : : /*
790 : : * cstring is handled similar to varlena - first we store the
791 : : * length as uint32 and then the data. We don't care about
792 : : * alignment, which means that during deserialization we need
793 : : * to copy the fields and only access the copies.
794 : : */
795 : :
796 : : /* c-strings include terminator, so +1 byte */
1843 797 : 0 : len = strlen(DatumGetCString(values[dim][i])) + 1;
1431 tgl@sss.pgh.pa.us 798 : 0 : info[dim].nbytes += sizeof(uint32); /* length */
799 : 0 : info[dim].nbytes += len; /* value */
800 : :
801 : : /* space needed for properly aligned deserialized copies */
1745 tomas.vondra@postgre 802 : 0 : info[dim].nbytes_aligned += MAXALIGN(len);
803 : : }
804 : : }
805 : :
806 : : /* we know (count>0) so there must be some data */
1845 tomas.vondra@postgre 807 [ - + ]:CBC 252 : Assert(info[dim].nbytes > 0);
808 : : }
809 : :
810 : : /*
811 : : * Now we can finally compute how much space we'll actually need for the
812 : : * whole serialized MCV list (varlena header, MCV header, dimension info
813 : : * for each attribute, deduplicated values and items).
814 : : */
1431 tgl@sss.pgh.pa.us 815 : 90 : total_length = (3 * sizeof(uint32)) /* magic + type + nitems */
816 : : + sizeof(AttrNumber) /* ndimensions */
817 : 90 : + (ndims * sizeof(Oid)); /* attribute types */
818 : :
819 : : /* dimension info */
1745 tomas.vondra@postgre 820 : 90 : total_length += ndims * sizeof(DimensionInfo);
821 : :
822 : : /* add space for the arrays of deduplicated values */
1845 823 [ + + ]: 345 : for (i = 0; i < ndims; i++)
1745 824 : 255 : total_length += info[i].nbytes;
825 : :
826 : : /*
827 : : * And finally account for the items (those are fixed-length, thanks to
828 : : * replacing values with uint16 indexes into the deduplicated arrays).
829 : : */
830 : 90 : total_length += mcvlist->nitems * ITEM_SIZE(dim);
831 : :
832 : : /*
833 : : * Allocate space for the whole serialized MCV list (we'll skip bytes, so
834 : : * we set them to zero to make the result more compressible).
835 : : */
836 : 90 : raw = (bytea *) palloc0(VARHDRSZ + total_length);
837 : 90 : SET_VARSIZE(raw, VARHDRSZ + total_length);
838 : :
1838 839 : 90 : ptr = VARDATA(raw);
1745 840 : 90 : endptr = ptr + total_length;
841 : :
842 : : /* copy the MCV list header fields, one by one */
1838 843 : 90 : memcpy(ptr, &mcvlist->magic, sizeof(uint32));
844 : 90 : ptr += sizeof(uint32);
845 : :
846 : 90 : memcpy(ptr, &mcvlist->type, sizeof(uint32));
847 : 90 : ptr += sizeof(uint32);
848 : :
849 : 90 : memcpy(ptr, &mcvlist->nitems, sizeof(uint32));
850 : 90 : ptr += sizeof(uint32);
851 : :
852 : 90 : memcpy(ptr, &mcvlist->ndimensions, sizeof(AttrNumber));
853 : 90 : ptr += sizeof(AttrNumber);
854 : :
855 : 90 : memcpy(ptr, mcvlist->types, sizeof(Oid) * ndims);
856 : 90 : ptr += (sizeof(Oid) * ndims);
857 : :
858 : : /* store information about the attributes (data amounts, ...) */
1844 859 : 90 : memcpy(ptr, info, sizeof(DimensionInfo) * ndims);
1745 860 : 90 : ptr += sizeof(DimensionInfo) * ndims;
861 : :
862 : : /* Copy the deduplicated values for all attributes to the output. */
1845 863 [ + + ]: 345 : for (dim = 0; dim < ndims; dim++)
864 : : {
865 : : /* remember the starting point for Asserts later */
1844 866 : 255 : char *start PG_USED_FOR_ASSERTS_ONLY = ptr;
867 : :
1845 868 [ + + ]: 7374 : for (i = 0; i < info[dim].nvalues; i++)
869 : : {
1844 870 : 7119 : Datum value = values[dim][i];
871 : :
1845 872 [ + + ]: 7119 : if (info[dim].typbyval) /* passed by value */
873 : : {
874 : : Datum tmp;
875 : :
876 : : /*
877 : : * For byval types, we need to copy just the significant bytes
878 : : * - we can't use memcpy directly, as that assumes
879 : : * little-endian behavior. store_att_byval does almost what
880 : : * we need, but it requires a properly aligned buffer - the
881 : : * output buffer does not guarantee that. So we simply use a
882 : : * local Datum variable (which guarantees proper alignment),
883 : : * and then copy the value from it.
884 : : */
1844 885 : 5181 : store_att_byval(&tmp, value, info[dim].typlen);
886 : :
887 : 5181 : memcpy(ptr, &tmp, info[dim].typlen);
888 : 5181 : ptr += info[dim].typlen;
889 : : }
1785 akapila@postgresql.o 890 [ + + ]: 1938 : else if (info[dim].typlen > 0) /* passed by reference */
891 : : {
892 : : /* no special alignment needed, treated as char array */
1844 tomas.vondra@postgre 893 : 555 : memcpy(ptr, DatumGetPointer(value), info[dim].typlen);
894 : 555 : ptr += info[dim].typlen;
895 : : }
1845 896 [ + - ]: 1383 : else if (info[dim].typlen == -1) /* varlena */
897 : : {
1745 898 [ - + - - : 1383 : uint32 len = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
- - - - -
+ ]
899 : :
900 : : /* copy the length */
901 : 1383 : memcpy(ptr, &len, sizeof(uint32));
902 : 1383 : ptr += sizeof(uint32);
903 : :
904 : : /* data from the varlena value (without the header) */
905 [ - + ]: 1383 : memcpy(ptr, VARDATA_ANY(DatumGetPointer(value)), len);
906 : 1383 : ptr += len;
907 : : }
1845 tomas.vondra@postgre 908 [ # # ]:UBC 0 : else if (info[dim].typlen == -2) /* cstring */
909 : : {
1745 910 : 0 : uint32 len = (uint32) strlen(DatumGetCString(value)) + 1;
911 : :
912 : : /* copy the length */
913 : 0 : memcpy(ptr, &len, sizeof(uint32));
914 : 0 : ptr += sizeof(uint32);
915 : :
916 : : /* value */
1844 917 : 0 : memcpy(ptr, DatumGetCString(value), len);
1745 918 : 0 : ptr += len;
919 : : }
920 : :
921 : : /* no underflows or overflows */
1844 tomas.vondra@postgre 922 [ + - - + ]:CBC 7119 : Assert((ptr > start) && ((ptr - start) <= info[dim].nbytes));
923 : : }
924 : :
925 : : /* we should get exactly nbytes of data for this dimension */
926 [ - + ]: 255 : Assert((ptr - start) == info[dim].nbytes);
927 : : }
928 : :
929 : : /* Serialize the items, with uint16 indexes instead of the values. */
1845 930 [ + + ]: 4386 : for (i = 0; i < mcvlist->nitems; i++)
931 : : {
1844 932 : 4296 : MCVItem *mcvitem = &mcvlist->items[i];
933 : :
934 : : /* don't write beyond the allocated space */
1745 935 [ - + ]: 4296 : Assert(ptr <= (endptr - ITEM_SIZE(dim)));
936 : :
937 : : /* copy NULL and frequency flags into the serialized MCV */
938 : 4296 : memcpy(ptr, mcvitem->isnull, sizeof(bool) * ndims);
939 : 4296 : ptr += sizeof(bool) * ndims;
940 : :
941 : 4296 : memcpy(ptr, &mcvitem->frequency, sizeof(double));
942 : 4296 : ptr += sizeof(double);
943 : :
944 : 4296 : memcpy(ptr, &mcvitem->base_frequency, sizeof(double));
945 : 4296 : ptr += sizeof(double);
946 : :
947 : : /* store the indexes last */
1845 948 [ + + ]: 16425 : for (dim = 0; dim < ndims; dim++)
949 : : {
1745 950 : 12129 : uint16 index = 0;
951 : : Datum *value;
952 : :
953 : : /* do the lookup only for non-NULL values */
954 [ + + ]: 12129 : if (!mcvitem->isnull[dim])
955 : : {
956 : 12096 : value = (Datum *) bsearch_arg(&mcvitem->values[dim], values[dim],
957 : 12096 : info[dim].nvalues, sizeof(Datum),
958 : 12096 : compare_scalars_simple, &ssup[dim]);
959 : :
1431 tgl@sss.pgh.pa.us 960 [ - + ]: 12096 : Assert(value != NULL); /* serialization or deduplication
961 : : * error */
962 : :
963 : : /* compute index within the deduplicated array */
1745 tomas.vondra@postgre 964 : 12096 : index = (uint16) (value - values[dim]);
965 : :
966 : : /* check the index is within expected bounds */
967 [ - + ]: 12096 : Assert(index < info[dim].nvalues);
968 : : }
969 : :
970 : : /* copy the index into the serialized MCV */
971 : 12129 : memcpy(ptr, &index, sizeof(uint16));
972 : 12129 : ptr += sizeof(uint16);
973 : : }
974 : :
975 : : /* make sure we don't overflow the allocated value */
976 [ - + ]: 4296 : Assert(ptr <= endptr);
977 : : }
978 : :
979 : : /* at this point we expect to match the total_length exactly */
980 [ - + ]: 90 : Assert(ptr == endptr);
981 : :
1845 982 : 90 : pfree(values);
983 : 90 : pfree(counts);
984 : :
1745 985 : 90 : return raw;
986 : : }
987 : :
988 : : /*
989 : : * statext_mcv_deserialize
990 : : * Reads serialized MCV list into MCVList structure.
991 : : *
992 : : * All the memory needed by the MCV list is allocated as a single chunk, so
993 : : * it's possible to simply pfree() it at once.
994 : : */
995 : : MCVList *
1845 996 : 246 : statext_mcv_deserialize(bytea *data)
997 : : {
998 : : int dim,
999 : : i;
1000 : : Size expected_size;
1001 : : MCVList *mcvlist;
1002 : : char *raw;
1003 : : char *ptr;
1004 : : char *endptr PG_USED_FOR_ASSERTS_ONLY;
1005 : :
1006 : : int ndims,
1007 : : nitems;
1008 : 246 : DimensionInfo *info = NULL;
1009 : :
1010 : : /* local allocation buffer (used only for deserialization) */
1844 1011 : 246 : Datum **map = NULL;
1012 : :
1013 : : /* MCV list */
1014 : : Size mcvlen;
1015 : :
1016 : : /* buffer used for the result */
1017 : : Size datalen;
1018 : : char *dataptr;
1019 : : char *valuesptr;
1020 : : char *isnullptr;
1021 : :
1845 1022 [ - + ]: 246 : if (data == NULL)
1845 tomas.vondra@postgre 1023 :UBC 0 : return NULL;
1024 : :
1025 : : /*
1026 : : * We can't possibly deserialize a MCV list if there's not even a complete
1027 : : * header. We need an explicit formula here, because we serialize the
1028 : : * header fields one by one, so we need to ignore struct alignment.
1029 : : */
1838 tomas.vondra@postgre 1030 [ - + - - :CBC 246 : if (VARSIZE_ANY(data) < MinSizeOfMCVList)
- - - - -
+ - + ]
415 peter@eisentraut.org 1031 [ # # # # :UBC 0 : elog(ERROR, "invalid MCV size %zu (expected at least %zu)",
# # # # #
# # # ]
1032 : : VARSIZE_ANY(data), MinSizeOfMCVList);
1033 : :
1034 : : /* read the MCV list header */
1844 tomas.vondra@postgre 1035 :CBC 246 : mcvlist = (MCVList *) palloc0(offsetof(MCVList, items));
1036 : :
1037 : : /* pointer to the data part (skip the varlena header) */
1838 1038 : 246 : raw = (char *) data;
1745 1039 [ - + ]: 246 : ptr = VARDATA_ANY(raw);
1040 [ - + - - : 246 : endptr = (char *) raw + VARSIZE_ANY(data);
- - - - -
+ ]
1041 : :
1042 : : /* get the header and perform further sanity checks */
1838 1043 : 246 : memcpy(&mcvlist->magic, ptr, sizeof(uint32));
1044 : 246 : ptr += sizeof(uint32);
1045 : :
1046 : 246 : memcpy(&mcvlist->type, ptr, sizeof(uint32));
1047 : 246 : ptr += sizeof(uint32);
1048 : :
1049 : 246 : memcpy(&mcvlist->nitems, ptr, sizeof(uint32));
1050 : 246 : ptr += sizeof(uint32);
1051 : :
1052 : 246 : memcpy(&mcvlist->ndimensions, ptr, sizeof(AttrNumber));
1053 : 246 : ptr += sizeof(AttrNumber);
1054 : :
1845 1055 [ - + ]: 246 : if (mcvlist->magic != STATS_MCV_MAGIC)
1845 tomas.vondra@postgre 1056 [ # # ]:UBC 0 : elog(ERROR, "invalid MCV magic %u (expected %u)",
1057 : : mcvlist->magic, STATS_MCV_MAGIC);
1058 : :
1845 tomas.vondra@postgre 1059 [ - + ]:CBC 246 : if (mcvlist->type != STATS_MCV_TYPE_BASIC)
1845 tomas.vondra@postgre 1060 [ # # ]:UBC 0 : elog(ERROR, "invalid MCV type %u (expected %u)",
1061 : : mcvlist->type, STATS_MCV_TYPE_BASIC);
1062 : :
1845 tomas.vondra@postgre 1063 [ - + ]:CBC 246 : if (mcvlist->ndimensions == 0)
1845 tomas.vondra@postgre 1064 [ # # ]:UBC 0 : elog(ERROR, "invalid zero-length dimension array in MCVList");
1845 tomas.vondra@postgre 1065 [ + - ]:CBC 246 : else if ((mcvlist->ndimensions > STATS_MAX_DIMENSIONS) ||
1066 [ - + ]: 246 : (mcvlist->ndimensions < 0))
1845 tomas.vondra@postgre 1067 [ # # ]:UBC 0 : elog(ERROR, "invalid length (%d) dimension array in MCVList",
1068 : : mcvlist->ndimensions);
1069 : :
1845 tomas.vondra@postgre 1070 [ - + ]:CBC 246 : if (mcvlist->nitems == 0)
1845 tomas.vondra@postgre 1071 [ # # ]:UBC 0 : elog(ERROR, "invalid zero-length item array in MCVList");
1845 tomas.vondra@postgre 1072 [ - + ]:CBC 246 : else if (mcvlist->nitems > STATS_MCVLIST_MAX_ITEMS)
1845 tomas.vondra@postgre 1073 [ # # ]:UBC 0 : elog(ERROR, "invalid length (%u) item array in MCVList",
1074 : : mcvlist->nitems);
1075 : :
1845 tomas.vondra@postgre 1076 :CBC 246 : nitems = mcvlist->nitems;
1077 : 246 : ndims = mcvlist->ndimensions;
1078 : :
1079 : : /*
1080 : : * Check amount of data including DimensionInfo for all dimensions and
1081 : : * also the serialized items (including uint16 indexes). Also, walk
1082 : : * through the dimension information and add it to the sum.
1083 : : */
1838 1084 : 246 : expected_size = SizeOfMCVList(ndims, nitems);
1085 : :
1086 : : /*
1087 : : * Check that we have at least the dimension and info records, along with
1088 : : * the items. We don't know the size of the serialized values yet. We need
1089 : : * to do this check first, before accessing the dimension info.
1090 : : */
1091 [ - + - - : 246 : if (VARSIZE_ANY(data) < expected_size)
- - - - -
+ - + ]
415 peter@eisentraut.org 1092 [ # # # # :UBC 0 : elog(ERROR, "invalid MCV size %zu (expected %zu)",
# # # # #
# # # ]
1093 : : VARSIZE_ANY(data), expected_size);
1094 : :
1095 : : /* Now copy the array of type Oids. */
1825 tomas.vondra@postgre 1096 :CBC 246 : memcpy(mcvlist->types, ptr, sizeof(Oid) * ndims);
1838 1097 : 246 : ptr += (sizeof(Oid) * ndims);
1098 : :
1099 : : /* Now it's safe to access the dimension info. */
1745 1100 : 246 : info = palloc(ndims * sizeof(DimensionInfo));
1101 : :
1102 : 246 : memcpy(info, ptr, ndims * sizeof(DimensionInfo));
1103 : 246 : ptr += (ndims * sizeof(DimensionInfo));
1104 : :
1105 : : /* account for the value arrays */
1845 1106 [ + + ]: 1059 : for (dim = 0; dim < ndims; dim++)
1107 : : {
1108 : : /*
1109 : : * XXX I wonder if we can/should rely on asserts here. Maybe those
1110 : : * checks should be done every time?
1111 : : */
1112 [ - + ]: 813 : Assert(info[dim].nvalues >= 0);
1113 [ - + ]: 813 : Assert(info[dim].nbytes >= 0);
1114 : :
1745 1115 : 813 : expected_size += info[dim].nbytes;
1116 : : }
1117 : :
1118 : : /*
1119 : : * Now we know the total expected MCV size, including all the pieces
1120 : : * (header, dimension info. items and deduplicated data). So do the final
1121 : : * check on size.
1122 : : */
1838 1123 [ - + - - : 246 : if (VARSIZE_ANY(data) != expected_size)
- - - - -
+ - + ]
415 peter@eisentraut.org 1124 [ # # # # :UBC 0 : elog(ERROR, "invalid MCV size %zu (expected %zu)",
# # # # #
# # # ]
1125 : : VARSIZE_ANY(data), expected_size);
1126 : :
1127 : : /*
1128 : : * We need an array of Datum values for each dimension, so that we can
1129 : : * easily translate the uint16 indexes later. We also need a top-level
1130 : : * array of pointers to those per-dimension arrays.
1131 : : *
1132 : : * While allocating the arrays for dimensions, compute how much space we
1133 : : * need for a copy of the by-ref data, as we can't simply point to the
1134 : : * original values (it might go away).
1135 : : */
1844 tomas.vondra@postgre 1136 :CBC 246 : datalen = 0; /* space for by-ref data */
1840 michael@paquier.xyz 1137 : 246 : map = (Datum **) palloc(ndims * sizeof(Datum *));
1138 : :
1845 tomas.vondra@postgre 1139 [ + + ]: 1059 : for (dim = 0; dim < ndims; dim++)
1140 : : {
1844 1141 : 813 : map[dim] = (Datum *) palloc(sizeof(Datum) * info[dim].nvalues);
1142 : :
1143 : : /* space needed for a copy of data for by-ref types */
1745 1144 : 813 : datalen += info[dim].nbytes_aligned;
1145 : : }
1146 : :
1147 : : /*
1148 : : * Now resize the MCV list so that the allocation includes all the data.
1149 : : *
1150 : : * Allocate space for a copy of the data, as we can't simply reference the
1151 : : * serialized data - it's not aligned properly, and it may disappear while
1152 : : * we're still using the MCV list, e.g. due to catcache release.
1153 : : *
1154 : : * We do care about alignment here, because we will allocate all the
1155 : : * pieces at once, but then use pointers to different parts.
1156 : : */
1843 1157 : 246 : mcvlen = MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
1158 : :
1159 : : /* arrays of values and isnull flags for all MCV items */
1842 1160 : 246 : mcvlen += nitems * MAXALIGN(sizeof(Datum) * ndims);
1161 : 246 : mcvlen += nitems * MAXALIGN(sizeof(bool) * ndims);
1162 : :
1163 : : /* we don't quite need to align this, but it makes some asserts easier */
1843 1164 : 246 : mcvlen += MAXALIGN(datalen);
1165 : :
1166 : : /* now resize the deserialized MCV list, and compute pointers to parts */
1844 1167 : 246 : mcvlist = repalloc(mcvlist, mcvlen);
1168 : :
1169 : : /* pointer to the beginning of values/isnull arrays */
1843 1170 : 246 : valuesptr = (char *) mcvlist
1171 : 246 : + MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
1172 : :
1842 1173 : 246 : isnullptr = valuesptr + (nitems * MAXALIGN(sizeof(Datum) * ndims));
1174 : :
1175 : 246 : dataptr = isnullptr + (nitems * MAXALIGN(sizeof(bool) * ndims));
1176 : :
1177 : : /*
1178 : : * Build mapping (index => value) for translating the serialized data into
1179 : : * the in-memory representation.
1180 : : */
1845 1181 [ + + ]: 1059 : for (dim = 0; dim < ndims; dim++)
1182 : : {
1183 : : /* remember start position in the input array */
1844 1184 : 813 : char *start PG_USED_FOR_ASSERTS_ONLY = ptr;
1185 : :
1845 1186 [ + + ]: 813 : if (info[dim].typbyval)
1187 : : {
1188 : : /* for by-val types we simply copy data into the mapping */
1844 1189 [ + + ]: 21963 : for (i = 0; i < info[dim].nvalues; i++)
1190 : : {
1191 : 21429 : Datum v = 0;
1192 : :
1193 : 21429 : memcpy(&v, ptr, info[dim].typlen);
1194 : 21429 : ptr += info[dim].typlen;
1195 : :
1196 : 21429 : map[dim][i] = fetch_att(&v, true, info[dim].typlen);
1197 : :
1198 : : /* no under/overflow of input array */
1199 [ - + ]: 21429 : Assert(ptr <= (start + info[dim].nbytes));
1200 : : }
1201 : : }
1202 : : else
1203 : : {
1204 : : /* for by-ref types we need to also make a copy of the data */
1205 : :
1206 : : /* passed by reference, but fixed length (name, tid, ...) */
1845 1207 [ + + ]: 279 : if (info[dim].typlen > 0)
1208 : : {
1209 [ + + ]: 1101 : for (i = 0; i < info[dim].nvalues; i++)
1210 : : {
1844 1211 : 1080 : memcpy(dataptr, ptr, info[dim].typlen);
1212 : 1080 : ptr += info[dim].typlen;
1213 : :
1214 : : /* just point into the array */
1215 : 1080 : map[dim][i] = PointerGetDatum(dataptr);
1745 1216 : 1080 : dataptr += MAXALIGN(info[dim].typlen);
1217 : : }
1218 : : }
1845 1219 [ + - ]: 258 : else if (info[dim].typlen == -1)
1220 : : {
1221 : : /* varlena */
1222 [ + + ]: 7494 : for (i = 0; i < info[dim].nvalues; i++)
1223 : : {
1224 : : uint32 len;
1225 : :
1226 : : /* read the uint32 length */
1745 1227 : 7236 : memcpy(&len, ptr, sizeof(uint32));
1228 : 7236 : ptr += sizeof(uint32);
1229 : :
1230 : : /* the length is data-only */
1231 : 7236 : SET_VARSIZE(dataptr, len + VARHDRSZ);
1232 : 7236 : memcpy(VARDATA(dataptr), ptr, len);
1233 : 7236 : ptr += len;
1234 : :
1235 : : /* just point into the array */
1844 1236 : 7236 : map[dim][i] = PointerGetDatum(dataptr);
1237 : :
1238 : : /* skip to place of the next deserialized value */
1745 1239 : 7236 : dataptr += MAXALIGN(len + VARHDRSZ);
1240 : : }
1241 : : }
1845 tomas.vondra@postgre 1242 [ # # ]:UBC 0 : else if (info[dim].typlen == -2)
1243 : : {
1244 : : /* cstring */
1245 [ # # ]: 0 : for (i = 0; i < info[dim].nvalues; i++)
1246 : : {
1247 : : uint32 len;
1248 : :
1745 1249 : 0 : memcpy(&len, ptr, sizeof(uint32));
1250 : 0 : ptr += sizeof(uint32);
1251 : :
1844 1252 : 0 : memcpy(dataptr, ptr, len);
1745 1253 : 0 : ptr += len;
1254 : :
1255 : : /* just point into the array */
1844 1256 : 0 : map[dim][i] = PointerGetDatum(dataptr);
1843 1257 : 0 : dataptr += MAXALIGN(len);
1258 : : }
1259 : : }
1260 : :
1261 : : /* no under/overflow of input array */
1844 tomas.vondra@postgre 1262 [ - + ]:CBC 279 : Assert(ptr <= (start + info[dim].nbytes));
1263 : :
1264 : : /* no overflow of the output mcv value */
1265 [ - + ]: 279 : Assert(dataptr <= ((char *) mcvlist + mcvlen));
1266 : : }
1267 : :
1268 : : /* check we consumed input data for this dimension exactly */
1269 [ - + ]: 813 : Assert(ptr == (start + info[dim].nbytes));
1270 : : }
1271 : :
1272 : : /* we should have also filled the MCV list exactly */
1273 [ - + ]: 246 : Assert(dataptr == ((char *) mcvlist + mcvlen));
1274 : :
1275 : : /* deserialize the MCV items and translate the indexes to Datums */
1845 1276 [ + + ]: 15219 : for (i = 0; i < nitems; i++)
1277 : : {
1844 1278 : 14973 : MCVItem *item = &mcvlist->items[i];
1279 : :
1280 : 14973 : item->values = (Datum *) valuesptr;
1842 1281 : 14973 : valuesptr += MAXALIGN(sizeof(Datum) * ndims);
1282 : :
1283 : 14973 : item->isnull = (bool *) isnullptr;
1284 : 14973 : isnullptr += MAXALIGN(sizeof(bool) * ndims);
1285 : :
1745 1286 : 14973 : memcpy(item->isnull, ptr, sizeof(bool) * ndims);
1287 : 14973 : ptr += sizeof(bool) * ndims;
1288 : :
1289 : 14973 : memcpy(&item->frequency, ptr, sizeof(double));
1290 : 14973 : ptr += sizeof(double);
1291 : :
1292 : 14973 : memcpy(&item->base_frequency, ptr, sizeof(double));
1293 : 14973 : ptr += sizeof(double);
1294 : :
1295 : : /* finally translate the indexes (for non-NULL only) */
1845 1296 [ + + ]: 66894 : for (dim = 0; dim < ndims; dim++)
1297 : : {
1298 : : uint16 index;
1299 : :
1745 1300 : 51921 : memcpy(&index, ptr, sizeof(uint16));
1301 : 51921 : ptr += sizeof(uint16);
1302 : :
1303 [ + + ]: 51921 : if (item->isnull[dim])
1304 : 153 : continue;
1305 : :
1306 : 51768 : item->values[dim] = map[dim][index];
1307 : : }
1308 : :
1309 : : /* check we're not overflowing the input */
1310 [ - + ]: 14973 : Assert(ptr <= endptr);
1311 : : }
1312 : :
1313 : : /* check that we processed all the data */
1314 [ - + ]: 246 : Assert(ptr == endptr);
1315 : :
1316 : : /* release the buffers used for mapping */
1844 1317 [ + + ]: 1059 : for (dim = 0; dim < ndims; dim++)
1318 : 813 : pfree(map[dim]);
1319 : :
1320 : 246 : pfree(map);
1321 : :
1845 1322 : 246 : return mcvlist;
1323 : : }
1324 : :
1325 : : /*
1326 : : * SRF with details about buckets of a histogram:
1327 : : *
1328 : : * - item ID (0...nitems)
1329 : : * - values (string array)
1330 : : * - nulls only (boolean array)
1331 : : * - frequency (double precision)
1332 : : * - base_frequency (double precision)
1333 : : *
1334 : : * The input is the OID of the statistics, and there are no rows returned if
1335 : : * the statistics contains no histogram.
1336 : : */
1337 : : Datum
1338 : 15 : pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)
1339 : : {
1340 : : FuncCallContext *funcctx;
1341 : :
1342 : : /* stuff done only on the first call of the function */
1343 [ + + ]: 15 : if (SRF_IS_FIRSTCALL())
1344 : : {
1345 : : MemoryContext oldcontext;
1346 : : MCVList *mcvlist;
1347 : : TupleDesc tupdesc;
1348 : :
1349 : : /* create a function context for cross-call persistence */
1350 : 6 : funcctx = SRF_FIRSTCALL_INIT();
1351 : :
1352 : : /* switch to memory context appropriate for multiple function calls */
1353 : 6 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1354 : :
1355 : 6 : mcvlist = statext_mcv_deserialize(PG_GETARG_BYTEA_P(0));
1356 : :
1357 : 6 : funcctx->user_fctx = mcvlist;
1358 : :
1359 : : /* total number of tuples to be returned */
1360 : 6 : funcctx->max_calls = 0;
1361 [ + - ]: 6 : if (funcctx->user_fctx != NULL)
1362 : 6 : funcctx->max_calls = mcvlist->nitems;
1363 : :
1364 : : /* Build a tuple descriptor for our result type */
1365 [ - + ]: 6 : if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
1845 tomas.vondra@postgre 1366 [ # # ]:UBC 0 : ereport(ERROR,
1367 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1368 : : errmsg("function returning record called in context "
1369 : : "that cannot accept type record")));
1746 tomas.vondra@postgre 1370 :CBC 6 : tupdesc = BlessTupleDesc(tupdesc);
1371 : :
1372 : : /*
1373 : : * generate attribute metadata needed later to produce tuples from raw
1374 : : * C strings
1375 : : */
1376 : 6 : funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
1377 : :
1845 1378 : 6 : MemoryContextSwitchTo(oldcontext);
1379 : : }
1380 : :
1381 : : /* stuff done on every call of the function */
1382 : 15 : funcctx = SRF_PERCALL_SETUP();
1383 : :
1431 tgl@sss.pgh.pa.us 1384 [ + + ]: 15 : if (funcctx->call_cntr < funcctx->max_calls) /* do when there is more
1385 : : * left to send */
1386 : : {
1387 : : Datum values[5];
1388 : : bool nulls[5];
1389 : : HeapTuple tuple;
1390 : : Datum result;
1746 tomas.vondra@postgre 1391 : 9 : ArrayBuildState *astate_values = NULL;
1392 : 9 : ArrayBuildState *astate_nulls = NULL;
1393 : :
1394 : : int i;
1395 : : MCVList *mcvlist;
1396 : : MCVItem *item;
1397 : :
1845 1398 : 9 : mcvlist = (MCVList *) funcctx->user_fctx;
1399 : :
1746 1400 [ - + ]: 9 : Assert(funcctx->call_cntr < mcvlist->nitems);
1401 : :
1402 : 9 : item = &mcvlist->items[funcctx->call_cntr];
1403 : :
1845 1404 [ + + ]: 36 : for (i = 0; i < mcvlist->ndimensions; i++)
1405 : : {
1406 : :
1746 1407 : 27 : astate_nulls = accumArrayResult(astate_nulls,
1431 tgl@sss.pgh.pa.us 1408 : 27 : BoolGetDatum(item->isnull[i]),
1409 : : false,
1410 : : BOOLOID,
1411 : : CurrentMemoryContext);
1412 : :
1746 tomas.vondra@postgre 1413 [ + + ]: 27 : if (!item->isnull[i])
1414 : : {
1415 : : bool isvarlena;
1416 : : Oid outfunc;
1417 : : FmgrInfo fmgrinfo;
1418 : : Datum val;
1419 : : text *txt;
1420 : :
1421 : : /* lookup output func for the type */
1422 : 15 : getTypeOutputInfo(mcvlist->types[i], &outfunc, &isvarlena);
1423 : 15 : fmgr_info(outfunc, &fmgrinfo);
1424 : :
1425 : 15 : val = FunctionCall1(&fmgrinfo, item->values[i]);
1426 : 15 : txt = cstring_to_text(DatumGetPointer(val));
1427 : :
1428 : 15 : astate_values = accumArrayResult(astate_values,
1429 : : PointerGetDatum(txt),
1430 : : false,
1431 : : TEXTOID,
1432 : : CurrentMemoryContext);
1433 : : }
1434 : : else
1435 : 12 : astate_values = accumArrayResult(astate_values,
1436 : : (Datum) 0,
1437 : : true,
1438 : : TEXTOID,
1439 : : CurrentMemoryContext);
1440 : : }
1441 : :
1442 : 9 : values[0] = Int32GetDatum(funcctx->call_cntr);
595 peter@eisentraut.org 1443 : 9 : values[1] = makeArrayResult(astate_values, CurrentMemoryContext);
1444 : 9 : values[2] = makeArrayResult(astate_nulls, CurrentMemoryContext);
1746 tomas.vondra@postgre 1445 : 9 : values[3] = Float8GetDatum(item->frequency);
1446 : 9 : values[4] = Float8GetDatum(item->base_frequency);
1447 : :
1448 : : /* no NULLs in the tuple */
1449 : 9 : memset(nulls, 0, sizeof(nulls));
1450 : :
1451 : : /* build a tuple */
1452 : 9 : tuple = heap_form_tuple(funcctx->attinmeta->tupdesc, values, nulls);
1453 : :
1454 : : /* make the tuple into a datum */
1845 1455 : 9 : result = HeapTupleGetDatum(tuple);
1456 : :
1457 : 9 : SRF_RETURN_NEXT(funcctx, result);
1458 : : }
1459 : : else /* do when there is no more left */
1460 : : {
1461 : 6 : SRF_RETURN_DONE(funcctx);
1462 : : }
1463 : : }
1464 : :
1465 : : /*
1466 : : * pg_mcv_list_in - input routine for type pg_mcv_list.
1467 : : *
1468 : : * pg_mcv_list is real enough to be a table column, but it has no operations
1469 : : * of its own, and disallows input too
1470 : : */
1471 : : Datum
1845 tomas.vondra@postgre 1472 :UBC 0 : pg_mcv_list_in(PG_FUNCTION_ARGS)
1473 : : {
1474 : : /*
1475 : : * pg_mcv_list stores the data in binary form and parsing text input is
1476 : : * not needed, so disallow this.
1477 : : */
1478 [ # # ]: 0 : ereport(ERROR,
1479 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1480 : : errmsg("cannot accept a value of type %s", "pg_mcv_list")));
1481 : :
1482 : : PG_RETURN_VOID(); /* keep compiler quiet */
1483 : : }
1484 : :
1485 : :
1486 : : /*
1487 : : * pg_mcv_list_out - output routine for type pg_mcv_list.
1488 : : *
1489 : : * MCV lists are serialized into a bytea value, so we simply call byteaout()
1490 : : * to serialize the value into text. But it'd be nice to serialize that into
1491 : : * a meaningful representation (e.g. for inspection by people).
1492 : : *
1493 : : * XXX This should probably return something meaningful, similar to what
1494 : : * pg_dependencies_out does. Not sure how to deal with the deduplicated
1495 : : * values, though - do we want to expand that or not?
1496 : : */
1497 : : Datum
1498 : 0 : pg_mcv_list_out(PG_FUNCTION_ARGS)
1499 : : {
1500 : 0 : return byteaout(fcinfo);
1501 : : }
1502 : :
1503 : : /*
1504 : : * pg_mcv_list_recv - binary input routine for type pg_mcv_list.
1505 : : */
1506 : : Datum
1507 : 0 : pg_mcv_list_recv(PG_FUNCTION_ARGS)
1508 : : {
1509 [ # # ]: 0 : ereport(ERROR,
1510 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1511 : : errmsg("cannot accept a value of type %s", "pg_mcv_list")));
1512 : :
1513 : : PG_RETURN_VOID(); /* keep compiler quiet */
1514 : : }
1515 : :
1516 : : /*
1517 : : * pg_mcv_list_send - binary output routine for type pg_mcv_list.
1518 : : *
1519 : : * MCV lists are serialized in a bytea value (although the type is named
1520 : : * differently), so let's just send that.
1521 : : */
1522 : : Datum
1523 : 0 : pg_mcv_list_send(PG_FUNCTION_ARGS)
1524 : : {
1525 : 0 : return byteasend(fcinfo);
1526 : : }
1527 : :
1528 : : /*
1529 : : * match the attribute/expression to a dimension of the statistic
1530 : : *
1531 : : * Returns the zero-based index of the matching statistics dimension.
1532 : : * Optionally determines the collation.
1533 : : */
1534 : : static int
1115 tomas.vondra@postgre 1535 :CBC 576 : mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid)
1536 : : {
1537 : : int idx;
1538 : :
1539 [ + + ]: 576 : if (IsA(expr, Var))
1540 : : {
1541 : : /* simple Var, so just lookup using varattno */
1542 : 453 : Var *var = (Var *) expr;
1543 : :
1544 [ + + ]: 453 : if (collid)
1545 : 420 : *collid = var->varcollid;
1546 : :
1547 : 453 : idx = bms_member_index(keys, var->varattno);
1548 : :
618 tgl@sss.pgh.pa.us 1549 [ - + ]: 453 : if (idx < 0)
618 tgl@sss.pgh.pa.us 1550 [ # # ]:UBC 0 : elog(ERROR, "variable not found in statistics object");
1551 : : }
1552 : : else
1553 : : {
1554 : : /* expression - lookup in stats expressions */
1555 : : ListCell *lc;
1556 : :
1115 tomas.vondra@postgre 1557 [ + + ]:CBC 123 : if (collid)
1558 : 120 : *collid = exprCollation(expr);
1559 : :
1560 : : /* expressions are stored after the simple columns */
618 tgl@sss.pgh.pa.us 1561 : 123 : idx = bms_num_members(keys);
1115 tomas.vondra@postgre 1562 [ + - + - : 228 : foreach(lc, exprs)
+ - ]
1563 : : {
1564 : 228 : Node *stat_expr = (Node *) lfirst(lc);
1565 : :
1566 [ + + ]: 228 : if (equal(expr, stat_expr))
1567 : 123 : break;
1568 : :
1569 : 105 : idx++;
1570 : : }
1571 : :
618 tgl@sss.pgh.pa.us 1572 [ - + ]: 123 : if (lc == NULL)
618 tgl@sss.pgh.pa.us 1573 [ # # ]:UBC 0 : elog(ERROR, "expression not found in statistics object");
1574 : : }
1575 : :
1115 tomas.vondra@postgre 1576 :CBC 576 : return idx;
1577 : : }
1578 : :
1579 : : /*
1580 : : * mcv_get_match_bitmap
1581 : : * Evaluate clauses using the MCV list, and update the match bitmap.
1582 : : *
1583 : : * A match bitmap keeps match/mismatch status for each MCV item, and we
1584 : : * update it based on additional clauses. We also use it to skip items
1585 : : * that can't possibly match (e.g. item marked as "mismatch" can't change
1586 : : * to "match" when evaluating AND clause list).
1587 : : *
1588 : : * The function also returns a flag indicating whether there was an
1589 : : * equality condition for all attributes, the minimum frequency in the MCV
1590 : : * list, and a total MCV frequency (sum of frequencies for all items).
1591 : : *
1592 : : * XXX Currently the match bitmap uses a bool for each MCV item, which is
1593 : : * somewhat wasteful as we could do with just a single bit, thus reducing
1594 : : * the size to ~1/8. It would also allow us to combine bitmaps simply using
1595 : : * & and |, which should be faster than min/max. The bitmaps are fairly
1596 : : * small, though (thanks to the cap on the MCV list size).
1597 : : */
1598 : : static bool *
1845 1599 : 357 : mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
1600 : : Bitmapset *keys, List *exprs,
1601 : : MCVList *mcvlist, bool is_or)
1602 : : {
1603 : : ListCell *l;
1604 : : bool *matches;
1605 : :
1606 : : /* The bitmap may be partially built. */
1607 [ - + ]: 357 : Assert(clauses != NIL);
1608 [ - + ]: 357 : Assert(mcvlist != NULL);
1609 [ - + ]: 357 : Assert(mcvlist->nitems > 0);
1610 [ - + ]: 357 : Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS);
1611 : :
1612 : 357 : matches = palloc(sizeof(bool) * mcvlist->nitems);
810 michael@paquier.xyz 1613 : 357 : memset(matches, !is_or, sizeof(bool) * mcvlist->nitems);
1614 : :
1615 : : /*
1616 : : * Loop through the list of clauses, and for each of them evaluate all the
1617 : : * MCV items not yet eliminated by the preceding clauses.
1618 : : */
1845 tomas.vondra@postgre 1619 [ + - + + : 1017 : foreach(l, clauses)
+ + ]
1620 : : {
1621 : 660 : Node *clause = (Node *) lfirst(l);
1622 : :
1623 : : /* if it's a RestrictInfo, then extract the clause */
1624 [ + + ]: 660 : if (IsA(clause, RestrictInfo))
1625 : 609 : clause = (Node *) ((RestrictInfo *) clause)->clause;
1626 : :
1627 : : /*
1628 : : * Handle the various types of clauses - OpClause, NullTest and
1629 : : * AND/OR/NOT
1630 : : */
1631 [ + + ]: 660 : if (is_opclause(clause))
1632 : : {
1633 : 438 : OpExpr *expr = (OpExpr *) clause;
1634 : : FmgrInfo opproc;
1635 : :
1636 : : /* valid only after examine_opclause_args returns true */
1637 : : Node *clause_expr;
1638 : : Const *cst;
1639 : : bool expronleft;
1640 : : int idx;
1641 : : Oid collid;
1642 : :
1737 1643 : 438 : fmgr_info(get_opcode(expr->opno), &opproc);
1644 : :
1645 : : /* extract the var/expr and const from the expression */
1115 1646 [ - + ]: 438 : if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
1115 tomas.vondra@postgre 1647 [ # # ]:UBC 0 : elog(ERROR, "incompatible clause");
1648 : :
1649 : : /* match the attribute/expression to a dimension of the statistic */
1115 tomas.vondra@postgre 1650 :CBC 438 : idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
1651 : :
1652 : : /*
1653 : : * Walk through the MCV items and evaluate the current clause. We
1654 : : * can skip items that were already ruled out, and terminate if
1655 : : * there are no remaining MCV items that might possibly match.
1656 : : */
599 drowley@postgresql.o 1657 [ + + ]: 29316 : for (int i = 0; i < mcvlist->nitems; i++)
1658 : : {
1115 tomas.vondra@postgre 1659 : 28878 : bool match = true;
1660 : 28878 : MCVItem *item = &mcvlist->items[i];
1661 : :
1662 [ - + ]: 28878 : Assert(idx >= 0);
1663 : :
1664 : : /*
1665 : : * When the MCV item or the Const value is NULL we can treat
1666 : : * this as a mismatch. We must not call the operator because
1667 : : * of strictness.
1668 : : */
1669 [ + + - + ]: 28878 : if (item->isnull[idx] || cst->constisnull)
1670 : : {
1671 [ + + - + ]: 24 : matches[i] = RESULT_MERGE(matches[i], is_or, false);
1672 : 24 : continue;
1673 : : }
1674 : :
1675 : : /*
1676 : : * Skip MCV items that can't change result in the bitmap. Once
1677 : : * the value gets false for AND-lists, or true for OR-lists,
1678 : : * we don't need to look at more clauses.
1679 : : */
1680 [ + + + + ]: 28854 : if (RESULT_IS_FINAL(matches[i], is_or))
1681 : 13482 : continue;
1682 : :
1683 : : /*
1684 : : * First check whether the constant is below the lower
1685 : : * boundary (in that case we can skip the bucket, because
1686 : : * there's no overlap).
1687 : : *
1688 : : * We don't store collations used to build the statistics, but
1689 : : * we can use the collation for the attribute itself, as
1690 : : * stored in varcollid. We do reset the statistics after a
1691 : : * type change (including collation change), so this is OK.
1692 : : * For expressions, we use the collation extracted from the
1693 : : * expression itself.
1694 : : */
1695 [ + + ]: 15372 : if (expronleft)
1696 : 14064 : match = DatumGetBool(FunctionCall2Coll(&opproc,
1697 : : collid,
1698 : 14064 : item->values[idx],
1699 : 14064 : cst->constvalue));
1700 : : else
1701 : 1308 : match = DatumGetBool(FunctionCall2Coll(&opproc,
1702 : : collid,
1703 : 1308 : cst->constvalue,
1704 : 1308 : item->values[idx]));
1705 : :
1706 : : /* update the match bitmap with the result */
1707 [ + + + - : 15372 : matches[i] = RESULT_MERGE(matches[i], is_or, match);
+ - + - +
+ ]
1708 : : }
1709 : : }
1492 1710 [ + + ]: 222 : else if (IsA(clause, ScalarArrayOpExpr))
1711 : : {
1431 tgl@sss.pgh.pa.us 1712 : 102 : ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
1713 : : FmgrInfo opproc;
1714 : :
1715 : : /* valid only after examine_opclause_args returns true */
1716 : : Node *clause_expr;
1717 : : Const *cst;
1718 : : bool expronleft;
1719 : : Oid collid;
1720 : : int idx;
1721 : :
1722 : : /* array evaluation */
1723 : : ArrayType *arrayval;
1724 : : int16 elmlen;
1725 : : bool elmbyval;
1726 : : char elmalign;
1727 : : int num_elems;
1728 : : Datum *elem_values;
1729 : : bool *elem_nulls;
1730 : :
1492 tomas.vondra@postgre 1731 : 102 : fmgr_info(get_opcode(expr->opno), &opproc);
1732 : :
1733 : : /* extract the var/expr and const from the expression */
1115 1734 [ - + ]: 102 : if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
1115 tomas.vondra@postgre 1735 [ # # ]:UBC 0 : elog(ERROR, "incompatible clause");
1736 : :
1737 : : /* We expect Var on left */
618 tgl@sss.pgh.pa.us 1738 [ - + ]:CBC 102 : if (!expronleft)
618 tgl@sss.pgh.pa.us 1739 [ # # ]:UBC 0 : elog(ERROR, "incompatible clause");
1740 : :
1741 : : /*
1742 : : * Deconstruct the array constant, unless it's NULL (we'll cover
1743 : : * that case below)
1744 : : */
618 tgl@sss.pgh.pa.us 1745 [ + - ]:CBC 102 : if (!cst->constisnull)
1746 : : {
1747 : 102 : arrayval = DatumGetArrayTypeP(cst->constvalue);
1748 : 102 : get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
1749 : : &elmlen, &elmbyval, &elmalign);
1750 : 102 : deconstruct_array(arrayval,
1751 : : ARR_ELEMTYPE(arrayval),
1752 : : elmlen, elmbyval, elmalign,
1753 : : &elem_values, &elem_nulls, &num_elems);
1754 : : }
1755 : :
1756 : : /* match the attribute/expression to a dimension of the statistic */
1115 tomas.vondra@postgre 1757 : 102 : idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
1758 : :
1759 : : /*
1760 : : * Walk through the MCV items and evaluate the current clause. We
1761 : : * can skip items that were already ruled out, and terminate if
1762 : : * there are no remaining MCV items that might possibly match.
1763 : : */
599 drowley@postgresql.o 1764 [ + + ]: 7848 : for (int i = 0; i < mcvlist->nitems; i++)
1765 : : {
1766 : : int j;
949 michael@paquier.xyz 1767 : 7746 : bool match = !expr->useOr;
1115 tomas.vondra@postgre 1768 : 7746 : MCVItem *item = &mcvlist->items[i];
1769 : :
1770 : : /*
1771 : : * When the MCV item or the Const value is NULL we can treat
1772 : : * this as a mismatch. We must not call the operator because
1773 : : * of strictness.
1774 : : */
1775 [ + + - + ]: 7746 : if (item->isnull[idx] || cst->constisnull)
1776 : : {
1777 [ - + - - ]: 9 : matches[i] = RESULT_MERGE(matches[i], is_or, false);
1778 : 9 : continue;
1779 : : }
1780 : :
1781 : : /*
1782 : : * Skip MCV items that can't change result in the bitmap. Once
1783 : : * the value gets false for AND-lists, or true for OR-lists,
1784 : : * we don't need to look at more clauses.
1785 : : */
1786 [ + + + + ]: 7737 : if (RESULT_IS_FINAL(matches[i], is_or))
1787 : 4008 : continue;
1788 : :
1789 [ + + ]: 14169 : for (j = 0; j < num_elems; j++)
1790 : : {
1791 : 11817 : Datum elem_value = elem_values[j];
1792 : 11817 : bool elem_isnull = elem_nulls[j];
1793 : : bool elem_match;
1794 : :
1795 : : /* NULL values always evaluate as not matching. */
1796 [ + + ]: 11817 : if (elem_isnull)
1797 : : {
1798 [ + - + + ]: 1056 : match = RESULT_MERGE(match, expr->useOr, false);
1492 1799 : 1056 : continue;
1800 : : }
1801 : :
1802 : : /*
1803 : : * Stop evaluating the array elements once we reach a
1804 : : * matching value that can't change - ALL() is the same as
1805 : : * AND-list, ANY() is the same as OR-list.
1806 : : */
1115 1807 [ + + + + ]: 10761 : if (RESULT_IS_FINAL(match, expr->useOr))
1808 : 1377 : break;
1809 : :
1810 : 9384 : elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
1811 : : collid,
1812 : 9384 : item->values[idx],
1813 : : elem_value));
1814 : :
1815 [ + + + - : 9384 : match = RESULT_MERGE(match, expr->useOr, elem_match);
+ + + - +
+ ]
1816 : : }
1817 : :
1818 : : /* update the match bitmap with the result */
1819 [ + + + - : 3729 : matches[i] = RESULT_MERGE(matches[i], is_or, match);
- + + - +
+ ]
1820 : : }
1821 : : }
1845 1822 [ + + ]: 120 : else if (IsA(clause, NullTest))
1823 : : {
1824 : 33 : NullTest *expr = (NullTest *) clause;
1115 1825 : 33 : Node *clause_expr = (Node *) (expr->arg);
1826 : :
1827 : : /* match the attribute/expression to a dimension of the statistic */
1828 : 33 : int idx = mcv_match_expression(clause_expr, keys, exprs, NULL);
1829 : :
1830 : : /*
1831 : : * Walk through the MCV items and evaluate the current clause. We
1832 : : * can skip items that were already ruled out, and terminate if
1833 : : * there are no remaining MCV items that might possibly match.
1834 : : */
599 drowley@postgresql.o 1835 [ + + ]: 3039 : for (int i = 0; i < mcvlist->nitems; i++)
1836 : : {
1845 tomas.vondra@postgre 1837 : 3006 : bool match = false; /* assume mismatch */
1844 1838 : 3006 : MCVItem *item = &mcvlist->items[i];
1839 : :
1840 : : /* if the clause mismatches the MCV item, update the bitmap */
1845 1841 [ + + - ]: 3006 : switch (expr->nulltesttype)
1842 : : {
1843 : 2106 : case IS_NULL:
1844 [ + + ]: 2106 : match = (item->isnull[idx]) ? true : match;
1845 : 2106 : break;
1846 : :
1847 : 900 : case IS_NOT_NULL:
1848 [ + + ]: 900 : match = (!item->isnull[idx]) ? true : match;
1849 : 900 : break;
1850 : : }
1851 : :
1852 : : /* now, update the match bitmap, depending on OR/AND type */
1733 1853 [ - + - - : 3006 : matches[i] = RESULT_MERGE(matches[i], is_or, match);
- - + + +
+ ]
1854 : : }
1855 : : }
1845 1856 [ + + + + ]: 87 : else if (is_orclause(clause) || is_andclause(clause))
1857 : 30 : {
1858 : : /* AND/OR clause, with all subclauses being compatible */
1859 : :
1860 : : int i;
1861 : 30 : BoolExpr *bool_clause = ((BoolExpr *) clause);
1862 : 30 : List *bool_clauses = bool_clause->args;
1863 : :
1864 : : /* match/mismatch bitmap for each MCV item */
1865 : 30 : bool *bool_matches = NULL;
1866 : :
1867 [ - + ]: 30 : Assert(bool_clauses != NIL);
1868 [ - + ]: 30 : Assert(list_length(bool_clauses) >= 2);
1869 : :
1870 : : /* build the match bitmap for the OR-clauses */
1115 1871 : 30 : bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs,
1845 1872 : 30 : mcvlist, is_orclause(clause));
1873 : :
1874 : : /*
1875 : : * Merge the bitmap produced by mcv_get_match_bitmap into the
1876 : : * current one. We need to consider if we're evaluating AND or OR
1877 : : * condition when merging the results.
1878 : : */
1879 [ + + ]: 1878 : for (i = 0; i < mcvlist->nitems; i++)
1733 1880 [ - + - - : 1848 : matches[i] = RESULT_MERGE(matches[i], is_or, bool_matches[i]);
- - + - +
+ ]
1881 : :
1845 1882 : 30 : pfree(bool_matches);
1883 : : }
1884 [ + + ]: 57 : else if (is_notclause(clause))
1885 : : {
1886 : : /* NOT clause, with all subclauses compatible */
1887 : :
1888 : : int i;
1889 : 15 : BoolExpr *not_clause = ((BoolExpr *) clause);
1890 : 15 : List *not_args = not_clause->args;
1891 : :
1892 : : /* match/mismatch bitmap for each MCV item */
1893 : 15 : bool *not_matches = NULL;
1894 : :
1895 [ - + ]: 15 : Assert(not_args != NIL);
1896 [ - + ]: 15 : Assert(list_length(not_args) == 1);
1897 : :
1898 : : /* build the match bitmap for the NOT-clause */
1115 1899 : 15 : not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs,
1900 : : mcvlist, false);
1901 : :
1902 : : /*
1903 : : * Merge the bitmap produced by mcv_get_match_bitmap into the
1904 : : * current one. We're handling a NOT clause, so invert the result
1905 : : * before merging it into the global bitmap.
1906 : : */
1845 1907 [ + + ]: 75 : for (i = 0; i < mcvlist->nitems; i++)
1733 1908 [ - + - - : 60 : matches[i] = RESULT_MERGE(matches[i], is_or, !not_matches[i]);
- - + + +
+ ]
1909 : :
1845 1910 : 15 : pfree(not_matches);
1911 : : }
1912 [ + + ]: 42 : else if (IsA(clause, Var))
1913 : : {
1914 : : /* Var (has to be a boolean Var, possibly from below NOT) */
1915 : :
1916 : 39 : Var *var = (Var *) (clause);
1917 : :
1918 : : /* match the attribute to a dimension of the statistic */
1919 : 39 : int idx = bms_member_index(keys, var->varattno);
1920 : :
1921 [ - + ]: 39 : Assert(var->vartype == BOOLOID);
1922 : :
1923 : : /*
1924 : : * Walk through the MCV items and evaluate the current clause. We
1925 : : * can skip items that were already ruled out, and terminate if
1926 : : * there are no remaining MCV items that might possibly match.
1927 : : */
599 drowley@postgresql.o 1928 [ + + ]: 189 : for (int i = 0; i < mcvlist->nitems; i++)
1929 : : {
1844 tomas.vondra@postgre 1930 : 150 : MCVItem *item = &mcvlist->items[i];
1845 1931 : 150 : bool match = false;
1932 : :
1933 : : /* if the item is NULL, it's a mismatch */
1934 [ + - + + ]: 150 : if (!item->isnull[idx] && DatumGetBool(item->values[idx]))
1935 : 75 : match = true;
1936 : :
1937 : : /* update the result bitmap */
1733 1938 [ + + + - : 150 : matches[i] = RESULT_MERGE(matches[i], is_or, match);
+ + + + +
+ ]
1939 : : }
1940 : : }
1941 : : else
1942 : : {
1943 : : /* Otherwise, it must be a bare boolean-returning expression */
1944 : : int idx;
1945 : :
1946 : : /* match the expression to a dimension of the statistic */
618 tgl@sss.pgh.pa.us 1947 : 3 : idx = mcv_match_expression(clause, keys, exprs, NULL);
1948 : :
1949 : : /*
1950 : : * Walk through the MCV items and evaluate the current clause. We
1951 : : * can skip items that were already ruled out, and terminate if
1952 : : * there are no remaining MCV items that might possibly match.
1953 : : */
599 drowley@postgresql.o 1954 [ + + ]: 111 : for (int i = 0; i < mcvlist->nitems; i++)
1955 : : {
1956 : : bool match;
618 tgl@sss.pgh.pa.us 1957 : 108 : MCVItem *item = &mcvlist->items[i];
1958 : :
1959 : : /* "match" just means it's bool TRUE */
1960 [ + - + + ]: 108 : match = !item->isnull[idx] && DatumGetBool(item->values[idx]);
1961 : :
1962 : : /* now, update the match bitmap, depending on OR/AND type */
1963 [ - + - - : 108 : matches[i] = RESULT_MERGE(matches[i], is_or, match);
- - + - +
+ ]
1964 : : }
1965 : : }
1966 : : }
1967 : :
1845 tomas.vondra@postgre 1968 : 357 : return matches;
1969 : : }
1970 : :
1971 : :
1972 : : /*
1973 : : * mcv_combine_selectivities
1974 : : * Combine per-column and multi-column MCV selectivity estimates.
1975 : : *
1976 : : * simple_sel is a "simple" selectivity estimate (produced without using any
1977 : : * extended statistics, essentially assuming independence of columns/clauses).
1978 : : *
1979 : : * mcv_sel and mcv_basesel are sums of the frequencies and base frequencies of
1980 : : * all matching MCV items. The difference (mcv_sel - mcv_basesel) is then
1981 : : * essentially interpreted as a correction to be added to simple_sel, as
1982 : : * described below.
1983 : : *
1984 : : * mcv_totalsel is the sum of the frequencies of all MCV items (not just the
1985 : : * matching ones). This is used as an upper bound on the portion of the
1986 : : * selectivity estimates not covered by the MCV statistics.
1987 : : *
1988 : : * Note: While simple and base selectivities are defined in a quite similar
1989 : : * way, the values are computed differently and are not therefore equal. The
1990 : : * simple selectivity is computed as a product of per-clause estimates, while
1991 : : * the base selectivity is computed by adding up base frequencies of matching
1992 : : * items of the multi-column MCV list. So the values may differ for two main
1993 : : * reasons - (a) the MCV list may not cover 100% of the data and (b) some of
1994 : : * the MCV items did not match the estimated clauses.
1995 : : *
1996 : : * As both (a) and (b) reduce the base selectivity value, it generally holds
1997 : : * that (simple_sel >= mcv_basesel). If the MCV list covers all the data, the
1998 : : * values may be equal.
1999 : : *
2000 : : * So, other_sel = (simple_sel - mcv_basesel) is an estimate for the part not
2001 : : * covered by the MCV list, and (mcv_sel - mcv_basesel) may be seen as a
2002 : : * correction for the part covered by the MCV list. Those two statements are
2003 : : * actually equivalent.
2004 : : */
2005 : : Selectivity
1228 dean.a.rasheed@gmail 2006 : 336 : mcv_combine_selectivities(Selectivity simple_sel,
2007 : : Selectivity mcv_sel,
2008 : : Selectivity mcv_basesel,
2009 : : Selectivity mcv_totalsel)
2010 : : {
2011 : : Selectivity other_sel;
2012 : : Selectivity sel;
2013 : :
2014 : : /* estimated selectivity of values not covered by MCV matches */
2015 : 336 : other_sel = simple_sel - mcv_basesel;
2016 [ + + - + ]: 336 : CLAMP_PROBABILITY(other_sel);
2017 : :
2018 : : /* this non-MCV selectivity cannot exceed 1 - mcv_totalsel */
2019 [ + + ]: 336 : if (other_sel > 1.0 - mcv_totalsel)
2020 : 222 : other_sel = 1.0 - mcv_totalsel;
2021 : :
2022 : : /* overall selectivity is the sum of the MCV and non-MCV parts */
2023 : 336 : sel = mcv_sel + other_sel;
2024 [ + + - + ]: 336 : CLAMP_PROBABILITY(sel);
2025 : :
2026 : 336 : return sel;
2027 : : }
2028 : :
2029 : :
2030 : : /*
2031 : : * mcv_clauselist_selectivity
2032 : : * Use MCV statistics to estimate the selectivity of an implicitly-ANDed
2033 : : * list of clauses.
2034 : : *
2035 : : * This determines which MCV items match every clause in the list and returns
2036 : : * the sum of the frequencies of those items.
2037 : : *
2038 : : * In addition, it returns the sum of the base frequencies of each of those
2039 : : * items (that is the sum of the selectivities that each item would have if
2040 : : * the columns were independent of one another), and the total selectivity of
2041 : : * all the MCV items (not just the matching ones). These are expected to be
2042 : : * used together with a "simple" selectivity estimate (one based only on
2043 : : * per-column statistics) to produce an overall selectivity estimate that
2044 : : * makes use of both per-column and multi-column statistics --- see
2045 : : * mcv_combine_selectivities().
2046 : : */
2047 : : Selectivity
1845 tomas.vondra@postgre 2048 : 192 : mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
2049 : : List *clauses, int varRelid,
2050 : : JoinType jointype, SpecialJoinInfo *sjinfo,
2051 : : RelOptInfo *rel,
2052 : : Selectivity *basesel, Selectivity *totalsel)
2053 : : {
2054 : : int i;
2055 : : MCVList *mcv;
2056 : 192 : Selectivity s = 0.0;
819 2057 : 192 : RangeTblEntry *rte = root->simple_rte_array[rel->relid];
2058 : :
2059 : : /* match/mismatch bitmap for each MCV item */
1845 2060 : 192 : bool *matches = NULL;
2061 : :
2062 : : /* load the MCV list stored in the statistics object */
819 2063 : 192 : mcv = statext_mcv_load(stat->statOid, rte->inh);
2064 : :
2065 : : /* build a match bitmap for the clauses */
1115 2066 : 192 : matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs,
2067 : : mcv, false);
2068 : :
2069 : : /* sum frequencies for all the matching MCV items */
1845 2070 : 192 : *basesel = 0.0;
2071 : 192 : *totalsel = 0.0;
2072 [ + + ]: 12552 : for (i = 0; i < mcv->nitems; i++)
2073 : : {
1844 2074 : 12360 : *totalsel += mcv->items[i].frequency;
2075 : :
1845 2076 [ + + ]: 12360 : if (matches[i] != false)
2077 : : {
1844 2078 : 231 : *basesel += mcv->items[i].base_frequency;
2079 : 231 : s += mcv->items[i].frequency;
2080 : : }
2081 : : }
2082 : :
1845 2083 : 192 : return s;
2084 : : }
2085 : :
2086 : :
2087 : : /*
2088 : : * mcv_clause_selectivity_or
2089 : : * Use MCV statistics to estimate the selectivity of a clause that
2090 : : * appears in an ORed list of clauses.
2091 : : *
2092 : : * As with mcv_clauselist_selectivity() this determines which MCV items match
2093 : : * the clause and returns both the sum of the frequencies and the sum of the
2094 : : * base frequencies of those items, as well as the sum of the frequencies of
2095 : : * all MCV items (not just the matching ones) so that this information can be
2096 : : * used by mcv_combine_selectivities() to produce a selectivity estimate that
2097 : : * makes use of both per-column and multi-column statistics.
2098 : : *
2099 : : * Additionally, we return information to help compute the overall selectivity
2100 : : * of the ORed list of clauses assumed to contain this clause. This function
2101 : : * is intended to be called for each clause in the ORed list of clauses,
2102 : : * allowing the overall selectivity to be computed using the following
2103 : : * algorithm:
2104 : : *
2105 : : * Suppose P[n] = P(C[1] OR C[2] OR ... OR C[n]) is the combined selectivity
2106 : : * of the first n clauses in the list. Then the combined selectivity taking
2107 : : * into account the next clause C[n+1] can be written as
2108 : : *
2109 : : * P[n+1] = P[n] + P(C[n+1]) - P((C[1] OR ... OR C[n]) AND C[n+1])
2110 : : *
2111 : : * The final term above represents the overlap between the clauses examined so
2112 : : * far and the (n+1)'th clause. To estimate its selectivity, we track the
2113 : : * match bitmap for the ORed list of clauses examined so far and examine its
2114 : : * intersection with the match bitmap for the (n+1)'th clause.
2115 : : *
2116 : : * We then also return the sums of the MCV item frequencies and base
2117 : : * frequencies for the match bitmap intersection corresponding to the overlap
2118 : : * term above, so that they can be combined with a simple selectivity estimate
2119 : : * for that term.
2120 : : *
2121 : : * The parameter "or_matches" is an in/out parameter tracking the match bitmap
2122 : : * for the clauses examined so far. The caller is expected to set it to NULL
2123 : : * the first time it calls this function.
2124 : : */
2125 : : Selectivity
1228 dean.a.rasheed@gmail 2126 : 120 : mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
2127 : : MCVList *mcv, Node *clause, bool **or_matches,
2128 : : Selectivity *basesel, Selectivity *overlap_mcvsel,
2129 : : Selectivity *overlap_basesel, Selectivity *totalsel)
2130 : : {
2131 : 120 : Selectivity s = 0.0;
2132 : : bool *new_matches;
2133 : : int i;
2134 : :
2135 : : /* build the OR-matches bitmap, if not built already */
2136 [ + + ]: 120 : if (*or_matches == NULL)
2137 : 48 : *or_matches = palloc0(sizeof(bool) * mcv->nitems);
2138 : :
2139 : : /* build the match bitmap for the new clause */
2140 : 120 : new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys,
2141 : : stat->exprs, mcv, false);
2142 : :
2143 : : /*
2144 : : * Sum the frequencies for all the MCV items matching this clause and also
2145 : : * those matching the overlap between this clause and any of the preceding
2146 : : * clauses as described above.
2147 : : */
2148 : 120 : *basesel = 0.0;
2149 : 120 : *overlap_mcvsel = 0.0;
2150 : 120 : *overlap_basesel = 0.0;
2151 : 120 : *totalsel = 0.0;
2152 [ + + ]: 7758 : for (i = 0; i < mcv->nitems; i++)
2153 : : {
2154 : 7638 : *totalsel += mcv->items[i].frequency;
2155 : :
2156 [ + + ]: 7638 : if (new_matches[i])
2157 : : {
2158 : 168 : s += mcv->items[i].frequency;
2159 : 168 : *basesel += mcv->items[i].base_frequency;
2160 : :
2161 [ + + ]: 168 : if ((*or_matches)[i])
2162 : : {
2163 : 72 : *overlap_mcvsel += mcv->items[i].frequency;
2164 : 72 : *overlap_basesel += mcv->items[i].base_frequency;
2165 : : }
2166 : : }
2167 : :
2168 : : /* update the OR-matches bitmap for the next clause */
2169 [ + + + + ]: 7638 : (*or_matches)[i] = (*or_matches)[i] || new_matches[i];
2170 : : }
2171 : :
2172 : 120 : pfree(new_matches);
2173 : :
2174 : 120 : return s;
2175 : : }
|