LCOV - Differential Code Coverage HEAD vs 15 - src/backend/statistics/mcv.c

LCOV - differential code coverage report

Current view:	top level - src/backend/statistics - mcv.c (source / functions)		Coverage	Total	Hit	UNC	UIC	UBC	GIC	GNC	CBC	EUB	ECB	DUB	DCB
Current:	Differential Code Coverage HEAD vs 15	Lines:	91.6 %	619	567	3	3	46	107	9	451	3	108	3	8
Current Date:	2023-04-08 17:13:01	Functions:	81.0 %	21	17			4	3	5	9		3
Baseline:	15	Line coverage date bins:
Baseline Date:	2023-04-08 15:09:40	[..60] days:	0.0 %	3	0	3
Legend:	Lines: hit not hit	(60,120] days:	100.0 %	2	2					2
		(180,240] days:	100.0 %	7	7					7
		(240..) days:	91.9 %	607	558		3	46	107		451	3	108
		Function coverage date bins:
		(240..) days:	70.8 %	24	17			4	3	5	9		3

 Age         Owner                  TLA  Line data    Source code

                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * mcv.c
                                  4                 :  *    POSTGRES multivariate MCV lists
                                  5                 :  *
                                  6                 :  *
                                  7                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                  8                 :  * Portions Copyright (c) 1994, Regents of the University of California
                                  9                 :  *
                                 10                 :  * IDENTIFICATION
                                 11                 :  *    src/backend/statistics/mcv.c
                                 12                 :  *
                                 13                 :  *-------------------------------------------------------------------------
                                 14                 :  */
                                 15                 : #include "postgres.h"
                                 16                 : 
                                 17                 : #include <math.h>
                                 18                 : 
                                 19                 : #include "access/htup_details.h"
                                 20                 : #include "catalog/pg_collation.h"
                                 21                 : #include "catalog/pg_statistic_ext.h"
                                 22                 : #include "catalog/pg_statistic_ext_data.h"
                                 23                 : #include "fmgr.h"
                                 24                 : #include "funcapi.h"
                                 25                 : #include "nodes/nodeFuncs.h"
                                 26                 : #include "optimizer/clauses.h"
                                 27                 : #include "statistics/extended_stats_internal.h"
                                 28                 : #include "statistics/statistics.h"
                                 29                 : #include "utils/array.h"
                                 30                 : #include "utils/builtins.h"
                                 31                 : #include "utils/bytea.h"
                                 32                 : #include "utils/fmgroids.h"
                                 33                 : #include "utils/fmgrprotos.h"
                                 34                 : #include "utils/lsyscache.h"
                                 35                 : #include "utils/selfuncs.h"
                                 36                 : #include "utils/syscache.h"
                                 37                 : #include "utils/typcache.h"
                                 38                 : 
                                 39                 : /*
                                 40                 :  * Computes size of a serialized MCV item, depending on the number of
                                 41                 :  * dimensions (columns) the statistic is defined on. The datum values are
                                 42                 :  * stored in a separate array (deduplicated, to minimize the size), and
                                 43                 :  * so the serialized items only store uint16 indexes into that array.
                                 44                 :  *
                                 45                 :  * Each serialized item stores (in this order):
                                 46                 :  *
                                 47                 :  * - indexes to values    (ndim * sizeof(uint16))
                                 48                 :  * - null flags           (ndim * sizeof(bool))
                                 49                 :  * - frequency            (sizeof(double))
                                 50                 :  * - base_frequency       (sizeof(double))
                                 51                 :  *
                                 52                 :  * There is no alignment padding within an MCV item.
                                 53                 :  * So in total each MCV item requires this many bytes:
                                 54                 :  *
                                 55                 :  *   ndim * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double)
                                 56                 :  */
                                 57                 : #define ITEM_SIZE(ndims)    \
                                 58                 :     ((ndims) * (sizeof(uint16) + sizeof(bool)) + 2 * sizeof(double))
                                 59                 : 
                                 60                 : /*
                                 61                 :  * Used to compute size of serialized MCV list representation.
                                 62                 :  */
                                 63                 : #define MinSizeOfMCVList        \
                                 64                 :     (VARHDRSZ + sizeof(uint32) * 3 + sizeof(AttrNumber))
                                 65                 : 
                                 66                 : /*
                                 67                 :  * Size of the serialized MCV list, excluding the space needed for
                                 68                 :  * deduplicated per-dimension values. The macro is meant to be used
                                 69                 :  * when it's not yet safe to access the serialized info about amount
                                 70                 :  * of data for each column.
                                 71                 :  */
                                 72                 : #define SizeOfMCVList(ndims,nitems) \
                                 73                 :     ((MinSizeOfMCVList + sizeof(Oid) * (ndims)) + \
                                 74                 :      ((ndims) * sizeof(DimensionInfo)) + \
                                 75                 :      ((nitems) * ITEM_SIZE(ndims)))
                                 76                 : 
                                 77                 : static MultiSortSupport build_mss(StatsBuildData *data);
                                 78                 : 
                                 79                 : static SortItem *build_distinct_groups(int numrows, SortItem *items,
                                 80                 :                                        MultiSortSupport mss, int *ndistinct);
                                 81                 : 
                                 82                 : static SortItem **build_column_frequencies(SortItem *groups, int ngroups,
                                 83                 :                                            MultiSortSupport mss, int *ncounts);
                                 84                 : 
                                 85                 : static int  count_distinct_groups(int numrows, SortItem *items,
                                 86                 :                                   MultiSortSupport mss);
                                 87                 : 
                                 88                 : /*
                                 89                 :  * Compute new value for bitmap item, considering whether it's used for
                                 90                 :  * clauses connected by AND/OR.
                                 91                 :  */
                                 92                 : #define RESULT_MERGE(value, is_or, match) \
                                 93                 :     ((is_or) ? ((value) || (match)) : ((value) && (match)))
                                 94                 : 
                                 95                 : /*
                                 96                 :  * When processing a list of clauses, the bitmap item may get set to a value
                                 97                 :  * such that additional clauses can't change it. For example, when processing
                                 98                 :  * a list of clauses connected to AND, as soon as the item gets set to 'false'
                                 99                 :  * then it'll remain like that. Similarly clauses connected by OR and 'true'.
                                100                 :  *
                                101                 :  * Returns true when the value in the bitmap can't change no matter how the
                                102                 :  * remaining clauses are evaluated.
                                103                 :  */
                                104                 : #define RESULT_IS_FINAL(value, is_or)   ((is_or) ? (value) : (!(value)))
                                105                 : 
                                106                 : /*
                                107                 :  * get_mincount_for_mcv_list
                                108                 :  *      Determine the minimum number of times a value needs to appear in
                                109                 :  *      the sample for it to be included in the MCV list.
                                110                 :  *
                                111                 :  * We want to keep only values that appear sufficiently often in the
                                112                 :  * sample that it is reasonable to extrapolate their sample frequencies to
                                113                 :  * the entire table.  We do this by placing an upper bound on the relative
                                114                 :  * standard error of the sample frequency, so that any estimates the
                                115                 :  * planner generates from the MCV statistics can be expected to be
                                116                 :  * reasonably accurate.
                                117                 :  *
                                118                 :  * Since we are sampling without replacement, the sample frequency of a
                                119                 :  * particular value is described by a hypergeometric distribution.  A
                                120                 :  * common rule of thumb when estimating errors in this situation is to
                                121                 :  * require at least 10 instances of the value in the sample, in which case
                                122                 :  * the distribution can be approximated by a normal distribution, and
                                123                 :  * standard error analysis techniques can be applied.  Given a sample size
                                124                 :  * of n, a population size of N, and a sample frequency of p=cnt/n, the
                                125                 :  * standard error of the proportion p is given by
                                126                 :  *      SE = sqrt(p*(1-p)/n) * sqrt((N-n)/(N-1))
                                127                 :  * where the second term is the finite population correction.  To get
                                128                 :  * reasonably accurate planner estimates, we impose an upper bound on the
                                129                 :  * relative standard error of 20% -- i.e., SE/p < 0.2.  This 20% relative
                                130                 :  * error bound is fairly arbitrary, but has been found empirically to work
                                131                 :  * well.  Rearranging this formula gives a lower bound on the number of
                                132                 :  * instances of the value seen:
                                133                 :  *      cnt > n*(N-n) / (N-n+0.04*n*(N-1))
                                134                 :  * This bound is at most 25, and approaches 0 as n approaches 0 or N. The
                                135                 :  * case where n approaches 0 cannot happen in practice, since the sample
                                136                 :  * size is at least 300.  The case where n approaches N corresponds to
                                137                 :  * sampling the whole table, in which case it is reasonable to keep
                                138                 :  * the whole MCV list (have no lower bound), so it makes sense to apply
                                139                 :  * this formula for all inputs, even though the above derivation is
                                140                 :  * technically only valid when the right hand side is at least around 10.
                                141                 :  *
                                142                 :  * An alternative way to look at this formula is as follows -- assume that
                                143                 :  * the number of instances of the value seen scales up to the entire
                                144                 :  * table, so that the population count is K=N*cnt/n. Then the distribution
                                145                 :  * in the sample is a hypergeometric distribution parameterised by N, n
                                146                 :  * and K, and the bound above is mathematically equivalent to demanding
                                147                 :  * that the standard deviation of that distribution is less than 20% of
                                148                 :  * its mean.  Thus the relative errors in any planner estimates produced
                                149                 :  * from the MCV statistics are likely to be not too large.
                                150                 :  */
                                151                 : static double
 1474 tomas.vondra              152 CBC          90 : get_mincount_for_mcv_list(int samplerows, double totalrows)
                                153                 : {
                                154              90 :     double      n = samplerows;
                                155              90 :     double      N = totalrows;
                                156                 :     double      numer,
                                157                 :                 denom;
                                158                 : 
                                159              90 :     numer = n * (N - n);
                                160              90 :     denom = N - n + 0.04 * n * (N - 1);
                                161                 : 
                                162                 :     /* Guard against division by zero (possible if n = N = 1) */
                                163              90 :     if (denom == 0.0)
                                164               6 :         return 0.0;
                                165                 : 
                                166              84 :     return numer / denom;
                                167                 : }
                                168                 : 
                                169                 : /*
                                170                 :  * Builds MCV list from the set of sampled rows.
                                171                 :  *
                                172                 :  * The algorithm is quite simple:
                                173                 :  *
                                174                 :  *     (1) sort the data (default collation, '<' for the data type)
                                175                 :  *
                                176                 :  *     (2) count distinct groups, decide how many to keep
                                177                 :  *
                                178                 :  *     (3) build the MCV list using the threshold determined in (2)
                                179                 :  *
                                180                 :  *     (4) remove rows represented by the MCV from the sample
                                181                 :  *
                                182                 :  */
                                183                 : MCVList *
  744                           184              90 : statext_mcv_build(StatsBuildData *data, double totalrows, int stattarget)
                                185                 : {
                                186                 :     int         i,
                                187                 :                 numattrs,
                                188                 :                 numrows,
                                189                 :                 ngroups,
                                190                 :                 nitems;
                                191                 :     double      mincount;
                                192                 :     SortItem   *items;
                                193                 :     SortItem   *groups;
 1474                           194              90 :     MCVList    *mcvlist = NULL;
                                195                 :     MultiSortSupport mss;
                                196                 : 
                                197                 :     /* comparator for all the columns */
  744                           198              90 :     mss = build_mss(data);
                                199                 : 
                                200                 :     /* sort the rows */
                                201              90 :     items = build_sorted_items(data, &nitems, mss,
                                202                 :                                data->nattnums, data->attnums);
                                203                 : 
 1474                           204              90 :     if (!items)
 1474 tomas.vondra              205 UBC           0 :         return NULL;
                                206                 : 
                                207                 :     /* for convenience */
  744 tomas.vondra              208 CBC          90 :     numattrs = data->nattnums;
                                209              90 :     numrows = data->numrows;
                                210                 : 
                                211                 :     /* transform the sorted rows into groups (sorted by frequency) */
 1474                           212              90 :     groups = build_distinct_groups(nitems, items, mss, &ngroups);
                                213                 : 
                                214                 :     /*
                                215                 :      * The maximum number of MCV items to store, based on the statistics
                                216                 :      * target we computed for the statistics object (from the target set for
                                217                 :      * the object itself, attributes and the system default). In any case, we
                                218                 :      * can't keep more groups than we have available.
                                219                 :      */
 1307                           220              90 :     nitems = stattarget;
 1474                           221              90 :     if (nitems > ngroups)
                                222              54 :         nitems = ngroups;
                                223                 : 
                                224                 :     /*
                                225                 :      * Decide how many items to keep in the MCV list. We can't use the same
                                226                 :      * algorithm as per-column MCV lists, because that only considers the
                                227                 :      * actual group frequency - but we're primarily interested in how the
                                228                 :      * actual frequency differs from the base frequency (product of simple
                                229                 :      * per-column frequencies, as if the columns were independent).
                                230                 :      *
                                231                 :      * Using the same algorithm might exclude items that are close to the
                                232                 :      * "average" frequency of the sample. But that does not say whether the
                                233                 :      * observed frequency is close to the base frequency or not. We also need
                                234                 :      * to consider unexpectedly uncommon items (again, compared to the base
                                235                 :      * frequency), and the single-column algorithm does not have to.
                                236                 :      *
                                237                 :      * We simply decide how many items to keep by computing the minimum count
                                238                 :      * using get_mincount_for_mcv_list() and then keep all items that seem to
                                239                 :      * be more common than that.
                                240                 :      */
                                241              90 :     mincount = get_mincount_for_mcv_list(numrows, totalrows);
                                242                 : 
                                243                 :     /*
                                244                 :      * Walk the groups until we find the first group with a count below the
                                245                 :      * mincount threshold (the index of that group is the number of groups we
                                246                 :      * want to keep).
                                247                 :      */
                                248            4386 :     for (i = 0; i < nitems; i++)
                                249                 :     {
                                250            4296 :         if (groups[i].count < mincount)
                                251                 :         {
 1474 tomas.vondra              252 UBC           0 :             nitems = i;
                                253               0 :             break;
                                254                 :         }
                                255                 :     }
                                256                 : 
                                257                 :     /*
                                258                 :      * At this point, we know the number of items for the MCV list. There
                                259                 :      * might be none (for uniform distribution with many groups), and in that
                                260                 :      * case, there will be no MCV list. Otherwise, construct the MCV list.
                                261                 :      */
 1474 tomas.vondra              262 CBC          90 :     if (nitems > 0)
                                263                 :     {
                                264                 :         int         j;
                                265                 :         SortItem    key;
                                266                 :         MultiSortSupport tmp;
                                267                 : 
                                268                 :         /* frequencies for values in each attribute */
                                269                 :         SortItem  **freqs;
                                270                 :         int        *nfreqs;
                                271                 : 
                                272                 :         /* used to search values */
 1375                           273              90 :         tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
                                274                 :                                         + sizeof(SortSupportData));
                                275                 : 
                                276                 :         /* compute frequencies for values in each column */
                                277              90 :         nfreqs = (int *) palloc0(sizeof(int) * numattrs);
                                278              90 :         freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
                                279                 : 
                                280                 :         /*
                                281                 :          * Allocate the MCV list structure, set the global parameters.
                                282                 :          */
 1473                           283              90 :         mcvlist = (MCVList *) palloc0(offsetof(MCVList, items) +
                                284              90 :                                       sizeof(MCVItem) * nitems);
                                285                 : 
 1474                           286              90 :         mcvlist->magic = STATS_MCV_MAGIC;
                                287              90 :         mcvlist->type = STATS_MCV_TYPE_BASIC;
                                288              90 :         mcvlist->ndimensions = numattrs;
                                289              90 :         mcvlist->nitems = nitems;
                                290                 : 
                                291                 :         /* store info about data type OIDs */
                                292             345 :         for (i = 0; i < numattrs; i++)
  744                           293             255 :             mcvlist->types[i] = data->stats[i]->attrtypid;
                                294                 : 
                                295                 :         /* Copy the first chunk of groups into the result. */
 1474                           296            4386 :         for (i = 0; i < nitems; i++)
                                297                 :         {
                                298                 :             /* just point to the proper place in the list */
 1473                           299            4296 :             MCVItem    *item = &mcvlist->items[i];
                                300                 : 
                                301            4296 :             item->values = (Datum *) palloc(sizeof(Datum) * numattrs);
                                302            4296 :             item->isnull = (bool *) palloc(sizeof(bool) * numattrs);
                                303                 : 
                                304                 :             /* copy values for the group */
 1474                           305            4296 :             memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs);
                                306            4296 :             memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs);
                                307                 : 
                                308                 :             /* groups should be sorted by frequency in descending order */
                                309            4296 :             Assert((i == 0) || (groups[i - 1].count >= groups[i].count));
                                310                 : 
                                311                 :             /* group frequency */
                                312            4296 :             item->frequency = (double) groups[i].count / numrows;
                                313                 : 
                                314                 :             /* base frequency, if the attributes were independent */
                                315            4296 :             item->base_frequency = 1.0;
                                316           16425 :             for (j = 0; j < numattrs; j++)
                                317                 :             {
                                318                 :                 SortItem   *freq;
                                319                 : 
                                320                 :                 /* single dimension */
 1375                           321           12129 :                 tmp->ndims = 1;
                                322           12129 :                 tmp->ssup[0] = mss->ssup[j];
                                323                 : 
                                324                 :                 /* fill search key */
                                325           12129 :                 key.values = &groups[i].values[j];
                                326           12129 :                 key.isnull = &groups[i].isnull[j];
                                327                 : 
                                328           12129 :                 freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
                                329                 :                                                 sizeof(SortItem),
                                330                 :                                                 multi_sort_compare, tmp);
                                331                 : 
                                332           12129 :                 item->base_frequency *= ((double) freq->count) / numrows;
                                333                 :             }
                                334                 :         }
                                335                 : 
                                336              90 :         pfree(nfreqs);
                                337              90 :         pfree(freqs);
                                338                 :     }
                                339                 : 
 1474                           340              90 :     pfree(items);
                                341              90 :     pfree(groups);
                                342                 : 
                                343              90 :     return mcvlist;
                                344                 : }
                                345                 : 
                                346                 : /*
                                347                 :  * build_mss
                                348                 :  *      Build a MultiSortSupport for the given StatsBuildData.
                                349                 :  */
                                350                 : static MultiSortSupport
  744                           351              90 : build_mss(StatsBuildData *data)
                                352                 : {
                                353                 :     int         i;
                                354              90 :     int         numattrs = data->nattnums;
                                355                 : 
                                356                 :     /* Sort by multiple columns (using array of SortSupport) */
 1474                           357              90 :     MultiSortSupport mss = multi_sort_init(numattrs);
                                358                 : 
                                359                 :     /* prepare the sort functions for all the attributes */
                                360             345 :     for (i = 0; i < numattrs; i++)
                                361                 :     {
  744                           362             255 :         VacAttrStats *colstat = data->stats[i];
                                363                 :         TypeCacheEntry *type;
                                364                 : 
 1474                           365             255 :         type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
                                366             255 :         if (type->lt_opr == InvalidOid) /* shouldn't happen */
 1474 tomas.vondra              367 UBC           0 :             elog(ERROR, "cache lookup failed for ordering operator for type %u",
                                368                 :                  colstat->attrtypid);
                                369                 : 
 1361 tomas.vondra              370 CBC         255 :         multi_sort_add_dimension(mss, i, type->lt_opr, colstat->attrcollid);
                                371                 :     }
                                372                 : 
 1474                           373              90 :     return mss;
                                374                 : }
                                375                 : 
                                376                 : /*
                                377                 :  * count_distinct_groups
                                378                 :  *      Count distinct combinations of SortItems in the array.
                                379                 :  *
                                380                 :  * The array is assumed to be sorted according to the MultiSortSupport.
                                381                 :  */
                                382                 : static int
                                383              90 : count_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss)
                                384                 : {
                                385                 :     int         i;
                                386                 :     int         ndistinct;
                                387                 : 
                                388              90 :     ndistinct = 1;
                                389          229509 :     for (i = 1; i < numrows; i++)
                                390                 :     {
                                391                 :         /* make sure the array really is sorted */
                                392          229419 :         Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
                                393                 : 
                                394          229419 :         if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
                                395           38856 :             ndistinct += 1;
                                396                 :     }
                                397                 : 
                                398              90 :     return ndistinct;
                                399                 : }
                                400                 : 
                                401                 : /*
                                402                 :  * compare_sort_item_count
                                403                 :  *      Comparator for sorting items by count (frequencies) in descending
                                404                 :  *      order.
                                405                 :  */
                                406                 : static int
  271 tgl                       407           39378 : compare_sort_item_count(const void *a, const void *b, void *arg)
                                408                 : {
 1474 tomas.vondra              409           39378 :     SortItem   *ia = (SortItem *) a;
                                410           39378 :     SortItem   *ib = (SortItem *) b;
                                411                 : 
                                412           39378 :     if (ia->count == ib->count)
                                413           38904 :         return 0;
                                414             474 :     else if (ia->count > ib->count)
                                415             315 :         return -1;
                                416                 : 
                                417             159 :     return 1;
                                418                 : }
                                419                 : 
                                420                 : /*
                                421                 :  * build_distinct_groups
                                422                 :  *      Build an array of SortItems for distinct groups and counts matching
                                423                 :  *      items.
                                424                 :  *
                                425                 :  * The 'items' array is assumed to be sorted.
                                426                 :  */
                                427                 : static SortItem *
                                428              90 : build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
                                429                 :                       int *ndistinct)
                                430                 : {
                                431                 :     int         i,
                                432                 :                 j;
                                433              90 :     int         ngroups = count_distinct_groups(numrows, items, mss);
                                434                 : 
                                435              90 :     SortItem   *groups = (SortItem *) palloc(ngroups * sizeof(SortItem));
                                436                 : 
                                437              90 :     j = 0;
                                438              90 :     groups[0] = items[0];
                                439              90 :     groups[0].count = 1;
                                440                 : 
                                441          229509 :     for (i = 1; i < numrows; i++)
                                442                 :     {
                                443                 :         /* Assume sorted in ascending order. */
                                444          229419 :         Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
                                445                 : 
                                446                 :         /* New distinct group detected. */
                                447          229419 :         if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
                                448                 :         {
                                449           38856 :             groups[++j] = items[i];
                                450           38856 :             groups[j].count = 0;
                                451                 :         }
                                452                 : 
                                453          229419 :         groups[j].count++;
                                454                 :     }
                                455                 : 
                                456                 :     /* ensure we filled the expected number of distinct groups */
                                457              90 :     Assert(j + 1 == ngroups);
                                458                 : 
                                459                 :     /* Sort the distinct groups by frequency (in descending order). */
   61 peter                     460 GNC          90 :     qsort_interruptible(groups, ngroups, sizeof(SortItem),
                                461                 :                         compare_sort_item_count, NULL);
                                462                 : 
 1474 tomas.vondra              463 CBC          90 :     *ndistinct = ngroups;
                                464              90 :     return groups;
                                465                 : }
                                466                 : 
                                467                 : /* compare sort items (single dimension) */
                                468                 : static int
 1375                           469          417045 : sort_item_compare(const void *a, const void *b, void *arg)
                                470                 : {
 1060 tgl                       471          417045 :     SortSupport ssup = (SortSupport) arg;
 1375 tomas.vondra              472          417045 :     SortItem   *ia = (SortItem *) a;
                                473          417045 :     SortItem   *ib = (SortItem *) b;
                                474                 : 
                                475          834090 :     return ApplySortComparator(ia->values[0], ia->isnull[0],
                                476          417045 :                                ib->values[0], ib->isnull[0],
                                477                 :                                ssup);
                                478                 : }
                                479                 : 
                                480                 : /*
                                481                 :  * build_column_frequencies
                                482                 :  *      Compute frequencies of values in each column.
                                483                 :  *
                                484                 :  * This returns an array of SortItems for each attribute the MCV is built
                                485                 :  * on, with a frequency (number of occurrences) for each value. This is
                                486                 :  * then used to compute "base" frequency of MCV items.
                                487                 :  *
                                488                 :  * All the memory is allocated in a single chunk, so that a single pfree
                                489                 :  * is enough to release it. We do not allocate space for values/isnull
                                490                 :  * arrays in the SortItems, because we can simply point into the input
                                491                 :  * groups directly.
                                492                 :  */
                                493                 : static SortItem **
                                494              90 : build_column_frequencies(SortItem *groups, int ngroups,
                                495                 :                          MultiSortSupport mss, int *ncounts)
                                496                 : {
                                497                 :     int         i,
                                498                 :                 dim;
                                499                 :     SortItem  **result;
                                500                 :     char       *ptr;
                                501                 : 
                                502              90 :     Assert(groups);
                                503              90 :     Assert(ncounts);
                                504                 : 
                                505                 :     /* allocate arrays for all columns as a single chunk */
                                506              90 :     ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) +
 1060 tgl                       507              90 :                  mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
                                508                 : 
                                509                 :     /* initial array of pointers */
 1375 tomas.vondra              510              90 :     result = (SortItem **) ptr;
                                511              90 :     ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims);
                                512                 : 
                                513             345 :     for (dim = 0; dim < mss->ndims; dim++)
                                514                 :     {
 1060 tgl                       515             255 :         SortSupport ssup = &mss->ssup[dim];
                                516                 : 
                                517                 :         /* array of values for a single column */
 1375 tomas.vondra              518             255 :         result[dim] = (SortItem *) ptr;
                                519             255 :         ptr += MAXALIGN(sizeof(SortItem) * ngroups);
                                520                 : 
                                521                 :         /* extract data for the dimension */
                                522          113484 :         for (i = 0; i < ngroups; i++)
                                523                 :         {
                                524                 :             /* point into the input groups */
                                525          113229 :             result[dim][i].values = &groups[i].values[dim];
                                526          113229 :             result[dim][i].isnull = &groups[i].isnull[dim];
                                527          113229 :             result[dim][i].count = groups[i].count;
                                528                 :         }
                                529                 : 
                                530                 :         /* sort the values, deduplicate */
   61 peter                     531 GNC         255 :         qsort_interruptible(result[dim], ngroups, sizeof(SortItem),
                                532                 :                             sort_item_compare, ssup);
                                533                 : 
                                534                 :         /*
                                535                 :          * Identify distinct values, compute frequency (there might be
                                536                 :          * multiple MCV items containing this value, so we need to sum counts
                                537                 :          * from all of them.
                                538                 :          */
 1375 tomas.vondra              539 CBC         255 :         ncounts[dim] = 1;
                                540          113229 :         for (i = 1; i < ngroups; i++)
                                541                 :         {
 1060 tgl                       542          112974 :             if (sort_item_compare(&result[dim][i - 1], &result[dim][i], ssup) == 0)
                                543                 :             {
                                544           68091 :                 result[dim][ncounts[dim] - 1].count += result[dim][i].count;
 1375 tomas.vondra              545           68091 :                 continue;
                                546                 :             }
                                547                 : 
                                548           44883 :             result[dim][ncounts[dim]] = result[dim][i];
                                549                 : 
                                550           44883 :             ncounts[dim]++;
                                551                 :         }
                                552                 :     }
                                553                 : 
                                554              90 :     return result;
                                555                 : }
                                556                 : 
                                557                 : /*
                                558                 :  * statext_mcv_load
                                559                 :  *      Load the MCV list for the indicated pg_statistic_ext_data tuple.
                                560                 :  */
                                561                 : MCVList *
  448                           562             240 : statext_mcv_load(Oid mvoid, bool inh)
                                563                 : {
                                564                 :     MCVList    *result;
                                565                 :     bool        isnull;
                                566                 :     Datum       mcvlist;
                                567             240 :     HeapTuple   htup = SearchSysCache2(STATEXTDATASTXOID,
                                568                 :                                        ObjectIdGetDatum(mvoid), BoolGetDatum(inh));
                                569                 : 
 1474                           570             240 :     if (!HeapTupleIsValid(htup))
 1474 tomas.vondra              571 UBC           0 :         elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
                                572                 : 
 1396 tomas.vondra              573 CBC         240 :     mcvlist = SysCacheGetAttr(STATEXTDATASTXOID, htup,
                                574                 :                               Anum_pg_statistic_ext_data_stxdmcv, &isnull);
                                575                 : 
 1474                           576             240 :     if (isnull)
 1473 tomas.vondra              577 UBC           0 :         elog(ERROR,
                                578                 :              "requested statistics kind \"%c\" is not yet built for statistics object %u",
                                579                 :              STATS_EXT_DEPENDENCIES, mvoid);
                                580                 : 
 1473 tomas.vondra              581 CBC         240 :     result = statext_mcv_deserialize(DatumGetByteaP(mcvlist));
                                582                 : 
                                583             240 :     ReleaseSysCache(htup);
                                584                 : 
                                585             240 :     return result;
                                586                 : }
                                587                 : 
                                588                 : 
                                589                 : /*
                                590                 :  * statext_mcv_serialize
                                591                 :  *      Serialize MCV list into a pg_mcv_list value.
                                592                 :  *
                                593                 :  * The MCV items may include values of various data types, and it's reasonable
                                594                 :  * to expect redundancy (values for a given attribute, repeated for multiple
                                595                 :  * MCV list items). So we deduplicate the values into arrays, and then replace
                                596                 :  * the values by indexes into those arrays.
                                597                 :  *
                                598                 :  * The overall structure of the serialized representation looks like this:
                                599                 :  *
                                600                 :  * +---------------+----------------+---------------------+-------+
                                601                 :  * | header fields | dimension info | deduplicated values | items |
                                602                 :  * +---------------+----------------+---------------------+-------+
                                603                 :  *
                                604                 :  * Where dimension info stores information about the type of the K-th
                                605                 :  * attribute (e.g. typlen, typbyval and length of deduplicated values).
                                606                 :  * Deduplicated values store deduplicated values for each attribute.  And
                                607                 :  * items store the actual MCV list items, with values replaced by indexes into
                                608                 :  * the arrays.
                                609                 :  *
                                610                 :  * When serializing the items, we use uint16 indexes. The number of MCV items
                                611                 :  * is limited by the statistics target (which is capped to 10k at the moment).
                                612                 :  * We might increase this to 65k and still fit into uint16, so there's a bit of
                                613                 :  * slack. Furthermore, this limit is on the number of distinct values per column,
                                614                 :  * and we usually have few of those (and various combinations of them for the
                                615                 :  * those MCV list). So uint16 seems fine for now.
                                616                 :  *
                                617                 :  * We don't really expect the serialization to save as much space as for
                                618                 :  * histograms, as we are not doing any bucket splits (which is the source
                                619                 :  * of high redundancy in histograms).
                                620                 :  *
                                621                 :  * TODO: Consider packing boolean flags (NULL) for each item into a single char
                                622                 :  * (or a longer type) instead of using an array of bool items.
                                623                 :  */
                                624                 : bytea *
 1418 tgl                       625              90 : statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
                                626                 : {
                                627                 :     int         i;
                                628                 :     int         dim;
 1474 tomas.vondra              629              90 :     int         ndims = mcvlist->ndimensions;
                                630                 : 
                                631                 :     SortSupport ssup;
                                632                 :     DimensionInfo *info;
                                633                 : 
                                634                 :     Size        total_length;
                                635                 : 
                                636                 :     /* serialized items (indexes into arrays, etc.) */
                                637                 :     bytea      *raw;
                                638                 :     char       *ptr;
                                639                 :     char       *endptr PG_USED_FOR_ASSERTS_ONLY;
                                640                 : 
                                641                 :     /* values per dimension (and number of non-NULL values) */
                                642              90 :     Datum     **values = (Datum **) palloc0(sizeof(Datum *) * ndims);
                                643              90 :     int        *counts = (int *) palloc0(sizeof(int) * ndims);
                                644                 : 
                                645                 :     /*
                                646                 :      * We'll include some rudimentary information about the attribute types
                                647                 :      * (length, by-val flag), so that we don't have to look them up while
                                648                 :      * deserializing the MCV list (we already have the type OID in the
                                649                 :      * header).  This is safe because when changing the type of the attribute
                                650                 :      * the statistics gets dropped automatically.  We need to store the info
                                651                 :      * about the arrays of deduplicated values anyway.
                                652                 :      */
                                653              90 :     info = (DimensionInfo *) palloc0(sizeof(DimensionInfo) * ndims);
                                654                 : 
                                655                 :     /* sort support data for all attributes included in the MCV list */
                                656              90 :     ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims);
                                657                 : 
                                658                 :     /* collect and deduplicate values for each dimension (attribute) */
                                659             345 :     for (dim = 0; dim < ndims; dim++)
                                660                 :     {
                                661                 :         int         ndistinct;
                                662                 :         TypeCacheEntry *typentry;
                                663                 : 
                                664                 :         /*
                                665                 :          * Lookup the LT operator (can't get it from stats extra_data, as we
                                666                 :          * don't know how to interpret that - scalar vs. array etc.).
                                667                 :          */
                                668             255 :         typentry = lookup_type_cache(stats[dim]->attrtypid, TYPECACHE_LT_OPR);
                                669                 : 
                                670                 :         /* copy important info about the data type (length, by-value) */
                                671             255 :         info[dim].typlen = stats[dim]->attrtype->typlen;
                                672             255 :         info[dim].typbyval = stats[dim]->attrtype->typbyval;
                                673                 : 
                                674                 :         /* allocate space for values in the attribute and collect them */
                                675             255 :         values[dim] = (Datum *) palloc0(sizeof(Datum) * mcvlist->nitems);
                                676                 : 
                                677           12384 :         for (i = 0; i < mcvlist->nitems; i++)
                                678                 :         {
                                679                 :             /* skip NULL values - we don't need to deduplicate those */
 1473                           680           12129 :             if (mcvlist->items[i].isnull[dim])
 1474                           681              33 :                 continue;
                                682                 : 
                                683                 :             /* append the value at the end */
 1473                           684           12096 :             values[dim][counts[dim]] = mcvlist->items[i].values[dim];
 1474                           685           12096 :             counts[dim] += 1;
                                686                 :         }
                                687                 : 
                                688                 :         /* if there are just NULL values in this dimension, we're done */
                                689             255 :         if (counts[dim] == 0)
                                690               3 :             continue;
                                691                 : 
                                692                 :         /* sort and deduplicate the data */
                                693             252 :         ssup[dim].ssup_cxt = CurrentMemoryContext;
 1361                           694             252 :         ssup[dim].ssup_collation = stats[dim]->attrcollid;
 1474                           695             252 :         ssup[dim].ssup_nulls_first = false;
                                696                 : 
                                697             252 :         PrepareSortSupportFromOrderingOp(typentry->lt_opr, &ssup[dim]);
                                698                 : 
  271 tgl                       699             252 :         qsort_interruptible(values[dim], counts[dim], sizeof(Datum),
                                700             252 :                             compare_scalars_simple, &ssup[dim]);
                                701                 : 
                                702                 :         /*
                                703                 :          * Walk through the array and eliminate duplicate values, but keep the
                                704                 :          * ordering (so that we can do a binary search later). We know there's
                                705                 :          * at least one item as (counts[dim] != 0), so we can skip the first
                                706                 :          * element.
                                707                 :          */
 1474 tomas.vondra              708             252 :         ndistinct = 1;          /* number of distinct values */
                                709           12096 :         for (i = 1; i < counts[dim]; i++)
                                710                 :         {
                                711                 :             /* expect sorted array */
                                712           11844 :             Assert(compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]) <= 0);
                                713                 : 
                                714                 :             /* if the value is the same as the previous one, we can skip it */
                                715           11844 :             if (!compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]))
                                716            4977 :                 continue;
                                717                 : 
                                718            6867 :             values[dim][ndistinct] = values[dim][i];
                                719            6867 :             ndistinct += 1;
                                720                 :         }
                                721                 : 
                                722                 :         /* we must not exceed PG_UINT16_MAX, as we use uint16 indexes */
                                723             252 :         Assert(ndistinct <= PG_UINT16_MAX);
                                724                 : 
                                725                 :         /*
                                726                 :          * Store additional info about the attribute - number of deduplicated
                                727                 :          * values, and also size of the serialized data. For fixed-length data
                                728                 :          * types this is trivial to compute, for varwidth types we need to
                                729                 :          * actually walk the array and sum the sizes.
                                730                 :          */
                                731             252 :         info[dim].nvalues = ndistinct;
                                732                 : 
 1060 tgl                       733             252 :         if (info[dim].typbyval) /* by-value data types */
                                734                 :         {
 1374 tomas.vondra              735             186 :             info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
                                736                 : 
                                737                 :             /*
                                738                 :              * We copy the data into the MCV item during deserialization, so
                                739                 :              * we don't need to allocate any extra space.
                                740                 :              */
                                741             186 :             info[dim].nbytes_aligned = 0;
                                742                 :         }
 1060 tgl                       743              66 :         else if (info[dim].typlen > 0)   /* fixed-length by-ref */
                                744                 :         {
                                745                 :             /*
                                746                 :              * We don't care about alignment in the serialized data, so we
                                747                 :              * pack the data as much as possible. But we also track how much
                                748                 :              * data will be needed after deserialization, and in that case we
                                749                 :              * need to account for alignment of each item.
                                750                 :              *
                                751                 :              * Note: As the items are fixed-length, we could easily compute
                                752                 :              * this during deserialization, but we do it here anyway.
                                753                 :              */
 1474 tomas.vondra              754              12 :             info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
 1374                           755              12 :             info[dim].nbytes_aligned = info[dim].nvalues * MAXALIGN(info[dim].typlen);
                                756                 :         }
 1474                           757              54 :         else if (info[dim].typlen == -1)    /* varlena */
                                758                 :         {
                                759              54 :             info[dim].nbytes = 0;
 1374                           760              54 :             info[dim].nbytes_aligned = 0;
 1474                           761            1437 :             for (i = 0; i < info[dim].nvalues; i++)
                                762                 :             {
                                763                 :                 Size        len;
                                764                 : 
                                765                 :                 /*
                                766                 :                  * For varlena values, we detoast the values and store the
                                767                 :                  * length and data separately. We don't bother with alignment
                                768                 :                  * here, which means that during deserialization we need to
                                769                 :                  * copy the fields and only access the copies.
                                770                 :                  */
                                771            1383 :                 values[dim][i] = PointerGetDatum(PG_DETOAST_DATUM(values[dim][i]));
                                772                 : 
                                773                 :                 /* serialized length (uint32 length + data) */
 1374                           774            1383 :                 len = VARSIZE_ANY_EXHDR(values[dim][i]);
 1060 tgl                       775            1383 :                 info[dim].nbytes += sizeof(uint32); /* length */
                                776            1383 :                 info[dim].nbytes += len;    /* value (no header) */
                                777                 : 
                                778                 :                 /*
                                779                 :                  * During deserialization we'll build regular varlena values
                                780                 :                  * with full headers, and we need to align them properly.
                                781                 :                  */
 1374 tomas.vondra              782            1383 :                 info[dim].nbytes_aligned += MAXALIGN(VARHDRSZ + len);
                                783                 :             }
                                784                 :         }
 1474 tomas.vondra              785 UBC           0 :         else if (info[dim].typlen == -2)    /* cstring */
                                786                 :         {
                                787               0 :             info[dim].nbytes = 0;
 1374                           788               0 :             info[dim].nbytes_aligned = 0;
 1474                           789               0 :             for (i = 0; i < info[dim].nvalues; i++)
                                790                 :             {
                                791                 :                 Size        len;
                                792                 : 
                                793                 :                 /*
                                794                 :                  * cstring is handled similar to varlena - first we store the
                                795                 :                  * length as uint32 and then the data. We don't care about
                                796                 :                  * alignment, which means that during deserialization we need
                                797                 :                  * to copy the fields and only access the copies.
                                798                 :                  */
                                799                 : 
                                800                 :                 /* c-strings include terminator, so +1 byte */
 1472                           801               0 :                 len = strlen(DatumGetCString(values[dim][i])) + 1;
 1060 tgl                       802               0 :                 info[dim].nbytes += sizeof(uint32); /* length */
                                803               0 :                 info[dim].nbytes += len;    /* value */
                                804                 : 
                                805                 :                 /* space needed for properly aligned deserialized copies */
 1374 tomas.vondra              806               0 :                 info[dim].nbytes_aligned += MAXALIGN(len);
                                807                 :             }
                                808                 :         }
                                809                 : 
                                810                 :         /* we know (count>0) so there must be some data */
 1474 tomas.vondra              811 CBC         252 :         Assert(info[dim].nbytes > 0);
                                812                 :     }
                                813                 : 
                                814                 :     /*
                                815                 :      * Now we can finally compute how much space we'll actually need for the
                                816                 :      * whole serialized MCV list (varlena header, MCV header, dimension info
                                817                 :      * for each attribute, deduplicated values and items).
                                818                 :      */
 1060 tgl                       819              90 :     total_length = (3 * sizeof(uint32)) /* magic + type + nitems */
                                820                 :         + sizeof(AttrNumber)    /* ndimensions */
                                821              90 :         + (ndims * sizeof(Oid));    /* attribute types */
                                822                 : 
                                823                 :     /* dimension info */
 1374 tomas.vondra              824              90 :     total_length += ndims * sizeof(DimensionInfo);
                                825                 : 
                                826                 :     /* add space for the arrays of deduplicated values */
 1474                           827             345 :     for (i = 0; i < ndims; i++)
 1374                           828             255 :         total_length += info[i].nbytes;
                                829                 : 
                                830                 :     /*
                                831                 :      * And finally account for the items (those are fixed-length, thanks to
                                832                 :      * replacing values with uint16 indexes into the deduplicated arrays).
                                833                 :      */
                                834              90 :     total_length += mcvlist->nitems * ITEM_SIZE(dim);
                                835                 : 
                                836                 :     /*
                                837                 :      * Allocate space for the whole serialized MCV list (we'll skip bytes, so
                                838                 :      * we set them to zero to make the result more compressible).
                                839                 :      */
                                840              90 :     raw = (bytea *) palloc0(VARHDRSZ + total_length);
                                841              90 :     SET_VARSIZE(raw, VARHDRSZ + total_length);
                                842                 : 
 1467                           843              90 :     ptr = VARDATA(raw);
 1374                           844              90 :     endptr = ptr + total_length;
                                845                 : 
                                846                 :     /* copy the MCV list header fields, one by one */
 1467                           847              90 :     memcpy(ptr, &mcvlist->magic, sizeof(uint32));
                                848              90 :     ptr += sizeof(uint32);
                                849                 : 
                                850              90 :     memcpy(ptr, &mcvlist->type, sizeof(uint32));
                                851              90 :     ptr += sizeof(uint32);
                                852                 : 
                                853              90 :     memcpy(ptr, &mcvlist->nitems, sizeof(uint32));
                                854              90 :     ptr += sizeof(uint32);
                                855                 : 
                                856              90 :     memcpy(ptr, &mcvlist->ndimensions, sizeof(AttrNumber));
                                857              90 :     ptr += sizeof(AttrNumber);
                                858                 : 
                                859              90 :     memcpy(ptr, mcvlist->types, sizeof(Oid) * ndims);
                                860              90 :     ptr += (sizeof(Oid) * ndims);
                                861                 : 
                                862                 :     /* store information about the attributes (data amounts, ...) */
 1473                           863              90 :     memcpy(ptr, info, sizeof(DimensionInfo) * ndims);
 1374                           864              90 :     ptr += sizeof(DimensionInfo) * ndims;
                                865                 : 
                                866                 :     /* Copy the deduplicated values for all attributes to the output. */
 1474                           867             345 :     for (dim = 0; dim < ndims; dim++)
                                868                 :     {
                                869                 :         /* remember the starting point for Asserts later */
 1473                           870             255 :         char       *start PG_USED_FOR_ASSERTS_ONLY = ptr;
                                871                 : 
 1474                           872            7374 :         for (i = 0; i < info[dim].nvalues; i++)
                                873                 :         {
 1473                           874            7119 :             Datum       value = values[dim][i];
                                875                 : 
 1474                           876            7119 :             if (info[dim].typbyval) /* passed by value */
                                877                 :             {
                                878                 :                 Datum       tmp;
                                879                 : 
                                880                 :                 /*
                                881                 :                  * For byval types, we need to copy just the significant bytes
                                882                 :                  * - we can't use memcpy directly, as that assumes
                                883                 :                  * little-endian behavior.  store_att_byval does almost what
                                884                 :                  * we need, but it requires a properly aligned buffer - the
                                885                 :                  * output buffer does not guarantee that. So we simply use a
                                886                 :                  * local Datum variable (which guarantees proper alignment),
                                887                 :                  * and then copy the value from it.
                                888                 :                  */
 1473                           889            5181 :                 store_att_byval(&tmp, value, info[dim].typlen);
                                890                 : 
                                891            5181 :                 memcpy(ptr, &tmp, info[dim].typlen);
                                892            5181 :                 ptr += info[dim].typlen;
                                893                 :             }
 1414 akapila                   894            1938 :             else if (info[dim].typlen > 0)   /* passed by reference */
                                895                 :             {
                                896                 :                 /* no special alignment needed, treated as char array */
 1473 tomas.vondra              897             555 :                 memcpy(ptr, DatumGetPointer(value), info[dim].typlen);
                                898             555 :                 ptr += info[dim].typlen;
                                899                 :             }
 1474                           900            1383 :             else if (info[dim].typlen == -1)    /* varlena */
                                901                 :             {
 1374                           902            1383 :                 uint32      len = VARSIZE_ANY_EXHDR(DatumGetPointer(value));
                                903                 : 
                                904                 :                 /* copy the length */
                                905            1383 :                 memcpy(ptr, &len, sizeof(uint32));
                                906            1383 :                 ptr += sizeof(uint32);
                                907                 : 
                                908                 :                 /* data from the varlena value (without the header) */
                                909            1383 :                 memcpy(ptr, VARDATA_ANY(DatumGetPointer(value)), len);
                                910            1383 :                 ptr += len;
                                911                 :             }
 1474 tomas.vondra              912 UBC           0 :             else if (info[dim].typlen == -2)    /* cstring */
                                913                 :             {
 1374                           914               0 :                 uint32      len = (uint32) strlen(DatumGetCString(value)) + 1;
                                915                 : 
                                916                 :                 /* copy the length */
                                917               0 :                 memcpy(ptr, &len, sizeof(uint32));
                                918               0 :                 ptr += sizeof(uint32);
                                919                 : 
                                920                 :                 /* value */
 1473                           921               0 :                 memcpy(ptr, DatumGetCString(value), len);
 1374                           922               0 :                 ptr += len;
                                923                 :             }
                                924                 : 
                                925                 :             /* no underflows or overflows */
 1473 tomas.vondra              926 CBC        7119 :             Assert((ptr > start) && ((ptr - start) <= info[dim].nbytes));
                                927                 :         }
                                928                 : 
                                929                 :         /* we should get exactly nbytes of data for this dimension */
                                930             255 :         Assert((ptr - start) == info[dim].nbytes);
                                931                 :     }
                                932                 : 
                                933                 :     /* Serialize the items, with uint16 indexes instead of the values. */
 1474                           934            4386 :     for (i = 0; i < mcvlist->nitems; i++)
                                935                 :     {
 1473                           936            4296 :         MCVItem    *mcvitem = &mcvlist->items[i];
                                937                 : 
                                938                 :         /* don't write beyond the allocated space */
 1374                           939            4296 :         Assert(ptr <= (endptr - ITEM_SIZE(dim)));
                                940                 : 
                                941                 :         /* copy NULL and frequency flags into the serialized MCV */
                                942            4296 :         memcpy(ptr, mcvitem->isnull, sizeof(bool) * ndims);
                                943            4296 :         ptr += sizeof(bool) * ndims;
                                944                 : 
                                945            4296 :         memcpy(ptr, &mcvitem->frequency, sizeof(double));
                                946            4296 :         ptr += sizeof(double);
                                947                 : 
                                948            4296 :         memcpy(ptr, &mcvitem->base_frequency, sizeof(double));
                                949            4296 :         ptr += sizeof(double);
                                950                 : 
                                951                 :         /* store the indexes last */
 1474                           952           16425 :         for (dim = 0; dim < ndims; dim++)
                                953                 :         {
 1374                           954           12129 :             uint16      index = 0;
                                955                 :             Datum      *value;
                                956                 : 
                                957                 :             /* do the lookup only for non-NULL values */
                                958           12129 :             if (!mcvitem->isnull[dim])
                                959                 :             {
                                960           12096 :                 value = (Datum *) bsearch_arg(&mcvitem->values[dim], values[dim],
                                961           12096 :                                               info[dim].nvalues, sizeof(Datum),
                                962           12096 :                                               compare_scalars_simple, &ssup[dim]);
                                963                 : 
 1060 tgl                       964           12096 :                 Assert(value != NULL);  /* serialization or deduplication
                                965                 :                                          * error */
                                966                 : 
                                967                 :                 /* compute index within the deduplicated array */
 1374 tomas.vondra              968           12096 :                 index = (uint16) (value - values[dim]);
                                969                 : 
                                970                 :                 /* check the index is within expected bounds */
                                971           12096 :                 Assert(index < info[dim].nvalues);
                                972                 :             }
                                973                 : 
                                974                 :             /* copy the index into the serialized MCV */
                                975           12129 :             memcpy(ptr, &index, sizeof(uint16));
                                976           12129 :             ptr += sizeof(uint16);
                                977                 :         }
                                978                 : 
                                979                 :         /* make sure we don't overflow the allocated value */
                                980            4296 :         Assert(ptr <= endptr);
                                981                 :     }
                                982                 : 
                                983                 :     /* at this point we expect to match the total_length exactly */
                                984              90 :     Assert(ptr == endptr);
                                985                 : 
 1474                           986              90 :     pfree(values);
                                987              90 :     pfree(counts);
                                988                 : 
 1374                           989              90 :     return raw;
                                990                 : }
                                991                 : 
                                992                 : /*
                                993                 :  * statext_mcv_deserialize
                                994                 :  *      Reads serialized MCV list into MCVList structure.
                                995                 :  *
                                996                 :  * All the memory needed by the MCV list is allocated as a single chunk, so
                                997                 :  * it's possible to simply pfree() it at once.
                                998                 :  */
                                999                 : MCVList *
 1474                          1000             246 : statext_mcv_deserialize(bytea *data)
                               1001                 : {
                               1002                 :     int         dim,
                               1003                 :                 i;
                               1004                 :     Size        expected_size;
                               1005                 :     MCVList    *mcvlist;
                               1006                 :     char       *raw;
                               1007                 :     char       *ptr;
                               1008                 :     char       *endptr PG_USED_FOR_ASSERTS_ONLY;
                               1009                 : 
                               1010                 :     int         ndims,
                               1011                 :                 nitems;
                               1012             246 :     DimensionInfo *info = NULL;
                               1013                 : 
                               1014                 :     /* local allocation buffer (used only for deserialization) */
 1473                          1015             246 :     Datum     **map = NULL;
                               1016                 : 
                               1017                 :     /* MCV list */
                               1018                 :     Size        mcvlen;
                               1019                 : 
                               1020                 :     /* buffer used for the result */
                               1021                 :     Size        datalen;
                               1022                 :     char       *dataptr;
                               1023                 :     char       *valuesptr;
                               1024                 :     char       *isnullptr;
                               1025                 : 
 1474                          1026             246 :     if (data == NULL)
 1474 tomas.vondra             1027 UBC           0 :         return NULL;
                               1028                 : 
                               1029                 :     /*
                               1030                 :      * We can't possibly deserialize a MCV list if there's not even a complete
                               1031                 :      * header. We need an explicit formula here, because we serialize the
                               1032                 :      * header fields one by one, so we need to ignore struct alignment.
                               1033                 :      */
 1467 tomas.vondra             1034 CBC         246 :     if (VARSIZE_ANY(data) < MinSizeOfMCVList)
   44 peter                    1035 UNC           0 :         elog(ERROR, "invalid MCV size %zu (expected at least %zu)",
                               1036                 :              VARSIZE_ANY(data), MinSizeOfMCVList);
                               1037                 : 
                               1038                 :     /* read the MCV list header */
 1473 tomas.vondra             1039 CBC         246 :     mcvlist = (MCVList *) palloc0(offsetof(MCVList, items));
                               1040                 : 
                               1041                 :     /* pointer to the data part (skip the varlena header) */
 1467                          1042             246 :     raw = (char *) data;
 1374                          1043             246 :     ptr = VARDATA_ANY(raw);
                               1044             246 :     endptr = (char *) raw + VARSIZE_ANY(data);
                               1045                 : 
                               1046                 :     /* get the header and perform further sanity checks */
 1467                          1047             246 :     memcpy(&mcvlist->magic, ptr, sizeof(uint32));
                               1048             246 :     ptr += sizeof(uint32);
                               1049                 : 
                               1050             246 :     memcpy(&mcvlist->type, ptr, sizeof(uint32));
                               1051             246 :     ptr += sizeof(uint32);
                               1052                 : 
                               1053             246 :     memcpy(&mcvlist->nitems, ptr, sizeof(uint32));
                               1054             246 :     ptr += sizeof(uint32);
                               1055                 : 
                               1056             246 :     memcpy(&mcvlist->ndimensions, ptr, sizeof(AttrNumber));
                               1057             246 :     ptr += sizeof(AttrNumber);
                               1058                 : 
 1474                          1059             246 :     if (mcvlist->magic != STATS_MCV_MAGIC)
 1474 tomas.vondra             1060 UBC           0 :         elog(ERROR, "invalid MCV magic %u (expected %u)",
                               1061                 :              mcvlist->magic, STATS_MCV_MAGIC);
                               1062                 : 
 1474 tomas.vondra             1063 CBC         246 :     if (mcvlist->type != STATS_MCV_TYPE_BASIC)
 1474 tomas.vondra             1064 UBC           0 :         elog(ERROR, "invalid MCV type %u (expected %u)",
                               1065                 :              mcvlist->type, STATS_MCV_TYPE_BASIC);
                               1066                 : 
 1474 tomas.vondra             1067 CBC         246 :     if (mcvlist->ndimensions == 0)
 1474 tomas.vondra             1068 UBC           0 :         elog(ERROR, "invalid zero-length dimension array in MCVList");
 1474 tomas.vondra             1069 CBC         246 :     else if ((mcvlist->ndimensions > STATS_MAX_DIMENSIONS) ||
                               1070             246 :              (mcvlist->ndimensions < 0))
 1474 tomas.vondra             1071 UBC           0 :         elog(ERROR, "invalid length (%d) dimension array in MCVList",
                               1072                 :              mcvlist->ndimensions);
                               1073                 : 
 1474 tomas.vondra             1074 CBC         246 :     if (mcvlist->nitems == 0)
 1474 tomas.vondra             1075 UBC           0 :         elog(ERROR, "invalid zero-length item array in MCVList");
 1474 tomas.vondra             1076 CBC         246 :     else if (mcvlist->nitems > STATS_MCVLIST_MAX_ITEMS)
 1474 tomas.vondra             1077 UBC           0 :         elog(ERROR, "invalid length (%u) item array in MCVList",
                               1078                 :              mcvlist->nitems);
                               1079                 : 
 1474 tomas.vondra             1080 CBC         246 :     nitems = mcvlist->nitems;
                               1081             246 :     ndims = mcvlist->ndimensions;
                               1082                 : 
                               1083                 :     /*
                               1084                 :      * Check amount of data including DimensionInfo for all dimensions and
                               1085                 :      * also the serialized items (including uint16 indexes). Also, walk
                               1086                 :      * through the dimension information and add it to the sum.
                               1087                 :      */
 1467                          1088             246 :     expected_size = SizeOfMCVList(ndims, nitems);
                               1089                 : 
                               1090                 :     /*
                               1091                 :      * Check that we have at least the dimension and info records, along with
                               1092                 :      * the items. We don't know the size of the serialized values yet. We need
                               1093                 :      * to do this check first, before accessing the dimension info.
                               1094                 :      */
                               1095             246 :     if (VARSIZE_ANY(data) < expected_size)
   44 peter                    1096 UNC           0 :         elog(ERROR, "invalid MCV size %zu (expected %zu)",
                               1097                 :              VARSIZE_ANY(data), expected_size);
                               1098                 : 
                               1099                 :     /* Now copy the array of type Oids. */
 1454 tomas.vondra             1100 CBC         246 :     memcpy(mcvlist->types, ptr, sizeof(Oid) * ndims);
 1467                          1101             246 :     ptr += (sizeof(Oid) * ndims);
                               1102                 : 
                               1103                 :     /* Now it's safe to access the dimension info. */
 1374                          1104             246 :     info = palloc(ndims * sizeof(DimensionInfo));
                               1105                 : 
                               1106             246 :     memcpy(info, ptr, ndims * sizeof(DimensionInfo));
                               1107             246 :     ptr += (ndims * sizeof(DimensionInfo));
                               1108                 : 
                               1109                 :     /* account for the value arrays */
 1474                          1110            1059 :     for (dim = 0; dim < ndims; dim++)
                               1111                 :     {
                               1112                 :         /*
                               1113                 :          * XXX I wonder if we can/should rely on asserts here. Maybe those
                               1114                 :          * checks should be done every time?
                               1115                 :          */
                               1116             813 :         Assert(info[dim].nvalues >= 0);
                               1117             813 :         Assert(info[dim].nbytes >= 0);
                               1118                 : 
 1374                          1119             813 :         expected_size += info[dim].nbytes;
                               1120                 :     }
                               1121                 : 
                               1122                 :     /*
                               1123                 :      * Now we know the total expected MCV size, including all the pieces
                               1124                 :      * (header, dimension info. items and deduplicated data). So do the final
                               1125                 :      * check on size.
                               1126                 :      */
 1467                          1127             246 :     if (VARSIZE_ANY(data) != expected_size)
   44 peter                    1128 UNC           0 :         elog(ERROR, "invalid MCV size %zu (expected %zu)",
                               1129                 :              VARSIZE_ANY(data), expected_size);
                               1130                 : 
                               1131                 :     /*
                               1132                 :      * We need an array of Datum values for each dimension, so that we can
                               1133                 :      * easily translate the uint16 indexes later. We also need a top-level
                               1134                 :      * array of pointers to those per-dimension arrays.
                               1135                 :      *
                               1136                 :      * While allocating the arrays for dimensions, compute how much space we
                               1137                 :      * need for a copy of the by-ref data, as we can't simply point to the
                               1138                 :      * original values (it might go away).
                               1139                 :      */
 1473 tomas.vondra             1140 CBC         246 :     datalen = 0;                /* space for by-ref data */
 1469 michael                  1141             246 :     map = (Datum **) palloc(ndims * sizeof(Datum *));
                               1142                 : 
 1474 tomas.vondra             1143            1059 :     for (dim = 0; dim < ndims; dim++)
                               1144                 :     {
 1473                          1145             813 :         map[dim] = (Datum *) palloc(sizeof(Datum) * info[dim].nvalues);
                               1146                 : 
                               1147                 :         /* space needed for a copy of data for by-ref types */
 1374                          1148             813 :         datalen += info[dim].nbytes_aligned;
                               1149                 :     }
                               1150                 : 
                               1151                 :     /*
                               1152                 :      * Now resize the MCV list so that the allocation includes all the data.
                               1153                 :      *
                               1154                 :      * Allocate space for a copy of the data, as we can't simply reference the
                               1155                 :      * serialized data - it's not aligned properly, and it may disappear while
                               1156                 :      * we're still using the MCV list, e.g. due to catcache release.
                               1157                 :      *
                               1158                 :      * We do care about alignment here, because we will allocate all the
                               1159                 :      * pieces at once, but then use pointers to different parts.
                               1160                 :      */
 1472                          1161             246 :     mcvlen = MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
                               1162                 : 
                               1163                 :     /* arrays of values and isnull flags for all MCV items */
 1471                          1164             246 :     mcvlen += nitems * MAXALIGN(sizeof(Datum) * ndims);
                               1165             246 :     mcvlen += nitems * MAXALIGN(sizeof(bool) * ndims);
                               1166                 : 
                               1167                 :     /* we don't quite need to align this, but it makes some asserts easier */
 1472                          1168             246 :     mcvlen += MAXALIGN(datalen);
                               1169                 : 
                               1170                 :     /* now resize the deserialized MCV list, and compute pointers to parts */
 1473                          1171             246 :     mcvlist = repalloc(mcvlist, mcvlen);
                               1172                 : 
                               1173                 :     /* pointer to the beginning of values/isnull arrays */
 1472                          1174             246 :     valuesptr = (char *) mcvlist
                               1175             246 :         + MAXALIGN(offsetof(MCVList, items) + (sizeof(MCVItem) * nitems));
                               1176                 : 
 1471                          1177             246 :     isnullptr = valuesptr + (nitems * MAXALIGN(sizeof(Datum) * ndims));
                               1178                 : 
                               1179             246 :     dataptr = isnullptr + (nitems * MAXALIGN(sizeof(bool) * ndims));
                               1180                 : 
                               1181                 :     /*
                               1182                 :      * Build mapping (index => value) for translating the serialized data into
                               1183                 :      * the in-memory representation.
                               1184                 :      */
 1474                          1185            1059 :     for (dim = 0; dim < ndims; dim++)
                               1186                 :     {
                               1187                 :         /* remember start position in the input array */
 1473                          1188             813 :         char       *start PG_USED_FOR_ASSERTS_ONLY = ptr;
                               1189                 : 
 1474                          1190             813 :         if (info[dim].typbyval)
                               1191                 :         {
                               1192                 :             /* for by-val types we simply copy data into the mapping */
 1473                          1193           21963 :             for (i = 0; i < info[dim].nvalues; i++)
                               1194                 :             {
                               1195           21429 :                 Datum       v = 0;
                               1196                 : 
                               1197           21429 :                 memcpy(&v, ptr, info[dim].typlen);
                               1198           21429 :                 ptr += info[dim].typlen;
                               1199                 : 
                               1200           21429 :                 map[dim][i] = fetch_att(&v, true, info[dim].typlen);
                               1201                 : 
                               1202                 :                 /* no under/overflow of input array */
                               1203           21429 :                 Assert(ptr <= (start + info[dim].nbytes));
                               1204                 :             }
                               1205                 :         }
                               1206                 :         else
                               1207                 :         {
                               1208                 :             /* for by-ref types we need to also make a copy of the data */
                               1209                 : 
                               1210                 :             /* passed by reference, but fixed length (name, tid, ...) */
 1474                          1211             279 :             if (info[dim].typlen > 0)
                               1212                 :             {
                               1213            1101 :                 for (i = 0; i < info[dim].nvalues; i++)
                               1214                 :                 {
 1473                          1215            1080 :                     memcpy(dataptr, ptr, info[dim].typlen);
                               1216            1080 :                     ptr += info[dim].typlen;
                               1217                 : 
                               1218                 :                     /* just point into the array */
                               1219            1080 :                     map[dim][i] = PointerGetDatum(dataptr);
 1374                          1220            1080 :                     dataptr += MAXALIGN(info[dim].typlen);
                               1221                 :                 }
                               1222                 :             }
 1474                          1223             258 :             else if (info[dim].typlen == -1)
                               1224                 :             {
                               1225                 :                 /* varlena */
                               1226            7494 :                 for (i = 0; i < info[dim].nvalues; i++)
                               1227                 :                 {
                               1228                 :                     uint32      len;
                               1229                 : 
                               1230                 :                     /* read the uint32 length */
 1374                          1231            7236 :                     memcpy(&len, ptr, sizeof(uint32));
                               1232            7236 :                     ptr += sizeof(uint32);
                               1233                 : 
                               1234                 :                     /* the length is data-only */
                               1235            7236 :                     SET_VARSIZE(dataptr, len + VARHDRSZ);
                               1236            7236 :                     memcpy(VARDATA(dataptr), ptr, len);
                               1237            7236 :                     ptr += len;
                               1238                 : 
                               1239                 :                     /* just point into the array */
 1473                          1240            7236 :                     map[dim][i] = PointerGetDatum(dataptr);
                               1241                 : 
                               1242                 :                     /* skip to place of the next deserialized value */
 1374                          1243            7236 :                     dataptr += MAXALIGN(len + VARHDRSZ);
                               1244                 :                 }
                               1245                 :             }
 1474 tomas.vondra             1246 UBC           0 :             else if (info[dim].typlen == -2)
                               1247                 :             {
                               1248                 :                 /* cstring */
                               1249               0 :                 for (i = 0; i < info[dim].nvalues; i++)
                               1250                 :                 {
                               1251                 :                     uint32      len;
                               1252                 : 
 1374                          1253               0 :                     memcpy(&len, ptr, sizeof(uint32));
                               1254               0 :                     ptr += sizeof(uint32);
                               1255                 : 
 1473                          1256               0 :                     memcpy(dataptr, ptr, len);
 1374                          1257               0 :                     ptr += len;
                               1258                 : 
                               1259                 :                     /* just point into the array */
 1473                          1260               0 :                     map[dim][i] = PointerGetDatum(dataptr);
 1472                          1261               0 :                     dataptr += MAXALIGN(len);
                               1262                 :                 }
                               1263                 :             }
                               1264                 : 
                               1265                 :             /* no under/overflow of input array */
 1473 tomas.vondra             1266 CBC         279 :             Assert(ptr <= (start + info[dim].nbytes));
                               1267                 : 
                               1268                 :             /* no overflow of the output mcv value */
                               1269             279 :             Assert(dataptr <= ((char *) mcvlist + mcvlen));
                               1270                 :         }
                               1271                 : 
                               1272                 :         /* check we consumed input data for this dimension exactly */
                               1273             813 :         Assert(ptr == (start + info[dim].nbytes));
                               1274                 :     }
                               1275                 : 
                               1276                 :     /* we should have also filled the MCV list exactly */
                               1277             246 :     Assert(dataptr == ((char *) mcvlist + mcvlen));
                               1278                 : 
                               1279                 :     /* deserialize the MCV items and translate the indexes to Datums */
 1474                          1280           15219 :     for (i = 0; i < nitems; i++)
                               1281                 :     {
 1473                          1282           14973 :         MCVItem    *item = &mcvlist->items[i];
                               1283                 : 
                               1284           14973 :         item->values = (Datum *) valuesptr;
 1471                          1285           14973 :         valuesptr += MAXALIGN(sizeof(Datum) * ndims);
                               1286                 : 
                               1287           14973 :         item->isnull = (bool *) isnullptr;
                               1288           14973 :         isnullptr += MAXALIGN(sizeof(bool) * ndims);
                               1289                 : 
 1374                          1290           14973 :         memcpy(item->isnull, ptr, sizeof(bool) * ndims);
                               1291           14973 :         ptr += sizeof(bool) * ndims;
                               1292                 : 
                               1293           14973 :         memcpy(&item->frequency, ptr, sizeof(double));
                               1294           14973 :         ptr += sizeof(double);
                               1295                 : 
                               1296           14973 :         memcpy(&item->base_frequency, ptr, sizeof(double));
                               1297           14973 :         ptr += sizeof(double);
                               1298                 : 
                               1299                 :         /* finally translate the indexes (for non-NULL only) */
 1474                          1300           66894 :         for (dim = 0; dim < ndims; dim++)
                               1301                 :         {
                               1302                 :             uint16      index;
                               1303                 : 
 1374                          1304           51921 :             memcpy(&index, ptr, sizeof(uint16));
                               1305           51921 :             ptr += sizeof(uint16);
                               1306                 : 
                               1307           51921 :             if (item->isnull[dim])
                               1308             153 :                 continue;
                               1309                 : 
                               1310           51768 :             item->values[dim] = map[dim][index];
                               1311                 :         }
                               1312                 : 
                               1313                 :         /* check we're not overflowing the input */
                               1314           14973 :         Assert(ptr <= endptr);
                               1315                 :     }
                               1316                 : 
                               1317                 :     /* check that we processed all the data */
                               1318             246 :     Assert(ptr == endptr);
                               1319                 : 
                               1320                 :     /* release the buffers used for mapping */
 1473                          1321            1059 :     for (dim = 0; dim < ndims; dim++)
                               1322             813 :         pfree(map[dim]);
                               1323                 : 
                               1324             246 :     pfree(map);
                               1325                 : 
 1474                          1326             246 :     return mcvlist;
                               1327                 : }
                               1328                 : 
                               1329                 : /*
                               1330                 :  * SRF with details about buckets of a histogram:
                               1331                 :  *
                               1332                 :  * - item ID (0...nitems)
                               1333                 :  * - values (string array)
                               1334                 :  * - nulls only (boolean array)
                               1335                 :  * - frequency (double precision)
                               1336                 :  * - base_frequency (double precision)
                               1337                 :  *
                               1338                 :  * The input is the OID of the statistics, and there are no rows returned if
                               1339                 :  * the statistics contains no histogram.
                               1340                 :  */
                               1341                 : Datum
                               1342              15 : pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)
                               1343                 : {
                               1344                 :     FuncCallContext *funcctx;
                               1345                 : 
                               1346                 :     /* stuff done only on the first call of the function */
                               1347              15 :     if (SRF_IS_FIRSTCALL())
                               1348                 :     {
                               1349                 :         MemoryContext oldcontext;
                               1350                 :         MCVList    *mcvlist;
                               1351                 :         TupleDesc   tupdesc;
                               1352                 : 
                               1353                 :         /* create a function context for cross-call persistence */
                               1354               6 :         funcctx = SRF_FIRSTCALL_INIT();
                               1355                 : 
                               1356                 :         /* switch to memory context appropriate for multiple function calls */
                               1357               6 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
                               1358                 : 
                               1359               6 :         mcvlist = statext_mcv_deserialize(PG_GETARG_BYTEA_P(0));
                               1360                 : 
                               1361               6 :         funcctx->user_fctx = mcvlist;
                               1362                 : 
                               1363                 :         /* total number of tuples to be returned */
                               1364               6 :         funcctx->max_calls = 0;
                               1365               6 :         if (funcctx->user_fctx != NULL)
                               1366               6 :             funcctx->max_calls = mcvlist->nitems;
                               1367                 : 
                               1368                 :         /* Build a tuple descriptor for our result type */
                               1369               6 :         if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
 1474 tomas.vondra             1370 UBC           0 :             ereport(ERROR,
                               1371                 :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                               1372                 :                      errmsg("function returning record called in context "
                               1373                 :                             "that cannot accept type record")));
 1375 tomas.vondra             1374 CBC           6 :         tupdesc = BlessTupleDesc(tupdesc);
                               1375                 : 
                               1376                 :         /*
                               1377                 :          * generate attribute metadata needed later to produce tuples from raw
                               1378                 :          * C strings
                               1379                 :          */
                               1380               6 :         funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
                               1381                 : 
 1474                          1382               6 :         MemoryContextSwitchTo(oldcontext);
                               1383                 :     }
                               1384                 : 
                               1385                 :     /* stuff done on every call of the function */
                               1386              15 :     funcctx = SRF_PERCALL_SETUP();
                               1387                 : 
 1060 tgl                      1388              15 :     if (funcctx->call_cntr < funcctx->max_calls)   /* do when there is more
                               1389                 :                                                      * left to send */
                               1390                 :     {
                               1391                 :         Datum       values[5];
                               1392                 :         bool        nulls[5];
                               1393                 :         HeapTuple   tuple;
                               1394                 :         Datum       result;
 1375 tomas.vondra             1395               9 :         ArrayBuildState *astate_values = NULL;
                               1396               9 :         ArrayBuildState *astate_nulls = NULL;
                               1397                 : 
                               1398                 :         int         i;
                               1399                 :         MCVList    *mcvlist;
                               1400                 :         MCVItem    *item;
                               1401                 : 
 1474                          1402               9 :         mcvlist = (MCVList *) funcctx->user_fctx;
                               1403                 : 
 1375                          1404               9 :         Assert(funcctx->call_cntr < mcvlist->nitems);
                               1405                 : 
                               1406               9 :         item = &mcvlist->items[funcctx->call_cntr];
                               1407                 : 
 1474                          1408              36 :         for (i = 0; i < mcvlist->ndimensions; i++)
                               1409                 :         {
                               1410                 : 
 1375                          1411              27 :             astate_nulls = accumArrayResult(astate_nulls,
 1060 tgl                      1412              27 :                                             BoolGetDatum(item->isnull[i]),
                               1413                 :                                             false,
                               1414                 :                                             BOOLOID,
                               1415                 :                                             CurrentMemoryContext);
                               1416                 : 
 1375 tomas.vondra             1417              27 :             if (!item->isnull[i])
                               1418                 :             {
                               1419                 :                 bool        isvarlena;
                               1420                 :                 Oid         outfunc;
                               1421                 :                 FmgrInfo    fmgrinfo;
                               1422                 :                 Datum       val;
                               1423                 :                 text       *txt;
                               1424                 : 
                               1425                 :                 /* lookup output func for the type */
                               1426              15 :                 getTypeOutputInfo(mcvlist->types[i], &outfunc, &isvarlena);
                               1427              15 :                 fmgr_info(outfunc, &fmgrinfo);
                               1428                 : 
                               1429              15 :                 val = FunctionCall1(&fmgrinfo, item->values[i]);
                               1430              15 :                 txt = cstring_to_text(DatumGetPointer(val));
                               1431                 : 
                               1432              15 :                 astate_values = accumArrayResult(astate_values,
                               1433                 :                                                  PointerGetDatum(txt),
                               1434                 :                                                  false,
                               1435                 :                                                  TEXTOID,
                               1436                 :                                                  CurrentMemoryContext);
                               1437                 :             }
                               1438                 :             else
                               1439              12 :                 astate_values = accumArrayResult(astate_values,
                               1440                 :                                                  (Datum) 0,
                               1441                 :                                                  true,
                               1442                 :                                                  TEXTOID,
                               1443                 :                                                  CurrentMemoryContext);
                               1444                 :         }
                               1445                 : 
                               1446               9 :         values[0] = Int32GetDatum(funcctx->call_cntr);
  224 peter                    1447 GNC           9 :         values[1] = makeArrayResult(astate_values, CurrentMemoryContext);
                               1448               9 :         values[2] = makeArrayResult(astate_nulls, CurrentMemoryContext);
 1375 tomas.vondra             1449 CBC           9 :         values[3] = Float8GetDatum(item->frequency);
                               1450               9 :         values[4] = Float8GetDatum(item->base_frequency);
                               1451                 : 
                               1452                 :         /* no NULLs in the tuple */
                               1453               9 :         memset(nulls, 0, sizeof(nulls));
                               1454                 : 
                               1455                 :         /* build a tuple */
                               1456               9 :         tuple = heap_form_tuple(funcctx->attinmeta->tupdesc, values, nulls);
                               1457                 : 
                               1458                 :         /* make the tuple into a datum */
 1474                          1459               9 :         result = HeapTupleGetDatum(tuple);
                               1460                 : 
                               1461               9 :         SRF_RETURN_NEXT(funcctx, result);
                               1462                 :     }
                               1463                 :     else                        /* do when there is no more left */
                               1464                 :     {
                               1465               6 :         SRF_RETURN_DONE(funcctx);
                               1466                 :     }
                               1467                 : }
                               1468                 : 
                               1469                 : /*
                               1470                 :  * pg_mcv_list_in       - input routine for type pg_mcv_list.
                               1471                 :  *
                               1472                 :  * pg_mcv_list is real enough to be a table column, but it has no operations
                               1473                 :  * of its own, and disallows input too
                               1474                 :  */
                               1475                 : Datum
 1474 tomas.vondra             1476 UBC           0 : pg_mcv_list_in(PG_FUNCTION_ARGS)
                               1477                 : {
                               1478                 :     /*
                               1479                 :      * pg_mcv_list stores the data in binary form and parsing text input is
                               1480                 :      * not needed, so disallow this.
                               1481                 :      */
                               1482               0 :     ereport(ERROR,
                               1483                 :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                               1484                 :              errmsg("cannot accept a value of type %s", "pg_mcv_list")));
                               1485                 : 
                               1486                 :     PG_RETURN_VOID();           /* keep compiler quiet */
                               1487                 : }
                               1488                 : 
                               1489                 : 
                               1490                 : /*
                               1491                 :  * pg_mcv_list_out      - output routine for type pg_mcv_list.
                               1492                 :  *
                               1493                 :  * MCV lists are serialized into a bytea value, so we simply call byteaout()
                               1494                 :  * to serialize the value into text. But it'd be nice to serialize that into
                               1495                 :  * a meaningful representation (e.g. for inspection by people).
                               1496                 :  *
                               1497                 :  * XXX This should probably return something meaningful, similar to what
                               1498                 :  * pg_dependencies_out does. Not sure how to deal with the deduplicated
                               1499                 :  * values, though - do we want to expand that or not?
                               1500                 :  */
                               1501                 : Datum
                               1502               0 : pg_mcv_list_out(PG_FUNCTION_ARGS)
                               1503                 : {
                               1504               0 :     return byteaout(fcinfo);
                               1505                 : }
                               1506                 : 
                               1507                 : /*
                               1508                 :  * pg_mcv_list_recv     - binary input routine for type pg_mcv_list.
                               1509                 :  */
                               1510                 : Datum
                               1511               0 : pg_mcv_list_recv(PG_FUNCTION_ARGS)
                               1512                 : {
                               1513               0 :     ereport(ERROR,
                               1514                 :             (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                               1515                 :              errmsg("cannot accept a value of type %s", "pg_mcv_list")));
                               1516                 : 
                               1517                 :     PG_RETURN_VOID();           /* keep compiler quiet */
                               1518                 : }
                               1519                 : 
                               1520                 : /*
                               1521                 :  * pg_mcv_list_send     - binary output routine for type pg_mcv_list.
                               1522                 :  *
                               1523                 :  * MCV lists are serialized in a bytea value (although the type is named
                               1524                 :  * differently), so let's just send that.
                               1525                 :  */
                               1526                 : Datum
                               1527               0 : pg_mcv_list_send(PG_FUNCTION_ARGS)
                               1528                 : {
                               1529               0 :     return byteasend(fcinfo);
                               1530                 : }
                               1531                 : 
                               1532                 : /*
                               1533                 :  * match the attribute/expression to a dimension of the statistic
                               1534                 :  *
                               1535                 :  * Returns the zero-based index of the matching statistics dimension.
                               1536                 :  * Optionally determines the collation.
                               1537                 :  */
                               1538                 : static int
  744 tomas.vondra             1539 CBC         576 : mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid)
                               1540                 : {
                               1541                 :     int         idx;
                               1542                 : 
                               1543             576 :     if (IsA(expr, Var))
                               1544                 :     {
                               1545                 :         /* simple Var, so just lookup using varattno */
                               1546             453 :         Var        *var = (Var *) expr;
                               1547                 : 
                               1548             453 :         if (collid)
                               1549             420 :             *collid = var->varcollid;
                               1550                 : 
                               1551             453 :         idx = bms_member_index(keys, var->varattno);
                               1552                 : 
  247 tgl                      1553             453 :         if (idx < 0)
  247 tgl                      1554 UBC           0 :             elog(ERROR, "variable not found in statistics object");
                               1555                 :     }
                               1556                 :     else
                               1557                 :     {
                               1558                 :         /* expression - lookup in stats expressions */
                               1559                 :         ListCell   *lc;
                               1560                 : 
  744 tomas.vondra             1561 CBC         123 :         if (collid)
                               1562             120 :             *collid = exprCollation(expr);
                               1563                 : 
                               1564                 :         /* expressions are stored after the simple columns */
  247 tgl                      1565             123 :         idx = bms_num_members(keys);
  744 tomas.vondra             1566             228 :         foreach(lc, exprs)
                               1567                 :         {
                               1568             228 :             Node       *stat_expr = (Node *) lfirst(lc);
                               1569                 : 
                               1570             228 :             if (equal(expr, stat_expr))
                               1571             123 :                 break;
                               1572                 : 
                               1573             105 :             idx++;
                               1574                 :         }
                               1575                 : 
  247 tgl                      1576             123 :         if (lc == NULL)
  247 tgl                      1577 UBC           0 :             elog(ERROR, "expression not found in statistics object");
                               1578                 :     }
                               1579                 : 
  744 tomas.vondra             1580 CBC         576 :     return idx;
                               1581                 : }
                               1582                 : 
                               1583                 : /*
                               1584                 :  * mcv_get_match_bitmap
                               1585                 :  *  Evaluate clauses using the MCV list, and update the match bitmap.
                               1586                 :  *
                               1587                 :  * A match bitmap keeps match/mismatch status for each MCV item, and we
                               1588                 :  * update it based on additional clauses. We also use it to skip items
                               1589                 :  * that can't possibly match (e.g. item marked as "mismatch" can't change
                               1590                 :  * to "match" when evaluating AND clause list).
                               1591                 :  *
                               1592                 :  * The function also returns a flag indicating whether there was an
                               1593                 :  * equality condition for all attributes, the minimum frequency in the MCV
                               1594                 :  * list, and a total MCV frequency (sum of frequencies for all items).
                               1595                 :  *
                               1596                 :  * XXX Currently the match bitmap uses a bool for each MCV item, which is
                               1597                 :  * somewhat wasteful as we could do with just a single bit, thus reducing
                               1598                 :  * the size to ~1/8. It would also allow us to combine bitmaps simply using
                               1599                 :  * & and |, which should be faster than min/max. The bitmaps are fairly
                               1600                 :  * small, though (thanks to the cap on the MCV list size).
                               1601                 :  */
                               1602                 : static bool *
 1474                          1603             357 : mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
                               1604                 :                      Bitmapset *keys, List *exprs,
                               1605                 :                      MCVList *mcvlist, bool is_or)
                               1606                 : {
                               1607                 :     ListCell   *l;
                               1608                 :     bool       *matches;
                               1609                 : 
 1474 tomas.vondra             1610 ECB             :     /* The bitmap may be partially built. */
 1474 tomas.vondra             1611 CBC         357 :     Assert(clauses != NIL);
                               1612             357 :     Assert(mcvlist != NULL);
 1474 tomas.vondra             1613 GIC         357 :     Assert(mcvlist->nitems > 0);
 1474 tomas.vondra             1614 CBC         357 :     Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS);
 1474 tomas.vondra             1615 ECB             : 
 1474 tomas.vondra             1616 GIC         357 :     matches = palloc(sizeof(bool) * mcvlist->nitems);
  439 michael                  1617             357 :     memset(matches, !is_or, sizeof(bool) * mcvlist->nitems);
                               1618                 : 
                               1619                 :     /*
                               1620                 :      * Loop through the list of clauses, and for each of them evaluate all the
 1474 tomas.vondra             1621 ECB             :      * MCV items not yet eliminated by the preceding clauses.
                               1622                 :      */
 1474 tomas.vondra             1623 CBC        1017 :     foreach(l, clauses)
                               1624                 :     {
 1474 tomas.vondra             1625 GIC         660 :         Node       *clause = (Node *) lfirst(l);
 1474 tomas.vondra             1626 ECB             : 
                               1627                 :         /* if it's a RestrictInfo, then extract the clause */
 1474 tomas.vondra             1628 GIC         660 :         if (IsA(clause, RestrictInfo))
                               1629             609 :             clause = (Node *) ((RestrictInfo *) clause)->clause;
                               1630                 : 
                               1631                 :         /*
                               1632                 :          * Handle the various types of clauses - OpClause, NullTest and
 1474 tomas.vondra             1633 ECB             :          * AND/OR/NOT
                               1634                 :          */
 1474 tomas.vondra             1635 CBC         660 :         if (is_opclause(clause))
                               1636                 :         {
 1474 tomas.vondra             1637 GIC         438 :             OpExpr     *expr = (OpExpr *) clause;
                               1638                 :             FmgrInfo    opproc;
                               1639                 : 
                               1640                 :             /* valid only after examine_opclause_args returns true */
                               1641                 :             Node       *clause_expr;
                               1642                 :             Const      *cst;
                               1643                 :             bool        expronleft;
                               1644                 :             int         idx;
  744 tomas.vondra             1645 ECB             :             Oid         collid;
                               1646                 : 
 1366 tomas.vondra             1647 GIC         438 :             fmgr_info(get_opcode(expr->opno), &opproc);
 1474 tomas.vondra             1648 ECB             : 
  744 tomas.vondra             1649 EUB             :             /* extract the var/expr and const from the expression */
  744 tomas.vondra             1650 GIC         438 :             if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
  744 tomas.vondra             1651 UIC           0 :                 elog(ERROR, "incompatible clause");
  744 tomas.vondra             1652 ECB             : 
                               1653                 :             /* match the attribute/expression to a dimension of the statistic */
  744 tomas.vondra             1654 GIC         438 :             idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
                               1655                 : 
                               1656                 :             /*
                               1657                 :              * Walk through the MCV items and evaluate the current clause. We
                               1658                 :              * can skip items that were already ruled out, and terminate if
  744 tomas.vondra             1659 ECB             :              * there are no remaining MCV items that might possibly match.
                               1660                 :              */
  228 drowley                  1661 GNC       29316 :             for (int i = 0; i < mcvlist->nitems; i++)
 1474 tomas.vondra             1662 ECB             :             {
  744 tomas.vondra             1663 GIC       28878 :                 bool        match = true;
  744 tomas.vondra             1664 CBC       28878 :                 MCVItem    *item = &mcvlist->items[i];
                               1665                 : 
  744 tomas.vondra             1666 GIC       28878 :                 Assert(idx >= 0);
                               1667                 : 
                               1668                 :                 /*
                               1669                 :                  * When the MCV item or the Const value is NULL we can treat
                               1670                 :                  * this as a mismatch. We must not call the operator because
  744 tomas.vondra             1671 ECB             :                  * of strictness.
                               1672                 :                  */
  744 tomas.vondra             1673 CBC       28878 :                 if (item->isnull[idx] || cst->constisnull)
 1474 tomas.vondra             1674 ECB             :                 {
  744 tomas.vondra             1675 GIC          24 :                     matches[i] = RESULT_MERGE(matches[i], is_or, false);
                               1676              24 :                     continue;
                               1677                 :                 }
                               1678                 : 
                               1679                 :                 /*
                               1680                 :                  * Skip MCV items that can't change result in the bitmap. Once
                               1681                 :                  * the value gets false for AND-lists, or true for OR-lists,
  744 tomas.vondra             1682 ECB             :                  * we don't need to look at more clauses.
                               1683                 :                  */
  744 tomas.vondra             1684 GIC       28854 :                 if (RESULT_IS_FINAL(matches[i], is_or))
                               1685           13482 :                     continue;
                               1686                 : 
                               1687                 :                 /*
                               1688                 :                  * First check whether the constant is below the lower
                               1689                 :                  * boundary (in that case we can skip the bucket, because
                               1690                 :                  * there's no overlap).
                               1691                 :                  *
                               1692                 :                  * We don't store collations used to build the statistics, but
                               1693                 :                  * we can use the collation for the attribute itself, as
                               1694                 :                  * stored in varcollid. We do reset the statistics after a
                               1695                 :                  * type change (including collation change), so this is OK.
                               1696                 :                  * For expressions, we use the collation extracted from the
  744 tomas.vondra             1697 ECB             :                  * expression itself.
                               1698                 :                  */
  744 tomas.vondra             1699 GIC       15372 :                 if (expronleft)
  744 tomas.vondra             1700 CBC       14064 :                     match = DatumGetBool(FunctionCall2Coll(&opproc,
  744 tomas.vondra             1701 ECB             :                                                            collid,
  744 tomas.vondra             1702 GIC       14064 :                                                            item->values[idx],
  744 tomas.vondra             1703 CBC       14064 :                                                            cst->constvalue));
                               1704                 :                 else
                               1705            1308 :                     match = DatumGetBool(FunctionCall2Coll(&opproc,
  744 tomas.vondra             1706 ECB             :                                                            collid,
  744 tomas.vondra             1707 GIC        1308 :                                                            cst->constvalue,
                               1708            1308 :                                                            item->values[idx]));
  744 tomas.vondra             1709 ECB             : 
                               1710                 :                 /* update the match bitmap with the result */
  744 tomas.vondra             1711 GIC       15372 :                 matches[i] = RESULT_MERGE(matches[i], is_or, match);
 1121 tomas.vondra             1712 ECB             :             }
                               1713                 :         }
 1121 tomas.vondra             1714 CBC         222 :         else if (IsA(clause, ScalarArrayOpExpr))
                               1715                 :         {
 1060 tgl                      1716 GIC         102 :             ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) clause;
                               1717                 :             FmgrInfo    opproc;
                               1718                 : 
                               1719                 :             /* valid only after examine_opclause_args returns true */
                               1720                 :             Node       *clause_expr;
                               1721                 :             Const      *cst;
                               1722                 :             bool        expronleft;
                               1723                 :             Oid         collid;
                               1724                 :             int         idx;
                               1725                 : 
                               1726                 :             /* array evaluation */
                               1727                 :             ArrayType  *arrayval;
                               1728                 :             int16       elmlen;
                               1729                 :             bool        elmbyval;
                               1730                 :             char        elmalign;
                               1731                 :             int         num_elems;
                               1732                 :             Datum      *elem_values;
  744 tomas.vondra             1733 ECB             :             bool       *elem_nulls;
                               1734                 : 
 1121 tomas.vondra             1735 GIC         102 :             fmgr_info(get_opcode(expr->opno), &opproc);
 1121 tomas.vondra             1736 ECB             : 
  744 tomas.vondra             1737 EUB             :             /* extract the var/expr and const from the expression */
  744 tomas.vondra             1738 GIC         102 :             if (!examine_opclause_args(expr->args, &clause_expr, &cst, &expronleft))
  744 tomas.vondra             1739 UIC           0 :                 elog(ERROR, "incompatible clause");
  744 tomas.vondra             1740 ECB             : 
  247 tgl                      1741 EUB             :             /* We expect Var on left */
  247 tgl                      1742 GIC         102 :             if (!expronleft)
  247 tgl                      1743 UIC           0 :                 elog(ERROR, "incompatible clause");
                               1744                 : 
                               1745                 :             /*
                               1746                 :              * Deconstruct the array constant, unless it's NULL (we'll cover
  247 tgl                      1747 ECB             :              * that case below)
                               1748                 :              */
  247 tgl                      1749 CBC         102 :             if (!cst->constisnull)
  247 tgl                      1750 ECB             :             {
  247 tgl                      1751 GIC         102 :                 arrayval = DatumGetArrayTypeP(cst->constvalue);
  247 tgl                      1752 CBC         102 :                 get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
                               1753                 :                                      &elmlen, &elmbyval, &elmalign);
  247 tgl                      1754 GIC         102 :                 deconstruct_array(arrayval,
                               1755                 :                                   ARR_ELEMTYPE(arrayval),
                               1756                 :                                   elmlen, elmbyval, elmalign,
                               1757                 :                                   &elem_values, &elem_nulls, &num_elems);
                               1758                 :             }
 1121 tomas.vondra             1759 ECB             : 
                               1760                 :             /* match the attribute/expression to a dimension of the statistic */
  744 tomas.vondra             1761 GIC         102 :             idx = mcv_match_expression(clause_expr, keys, exprs, &collid);
                               1762                 : 
                               1763                 :             /*
                               1764                 :              * Walk through the MCV items and evaluate the current clause. We
                               1765                 :              * can skip items that were already ruled out, and terminate if
  744 tomas.vondra             1766 ECB             :              * there are no remaining MCV items that might possibly match.
                               1767                 :              */
  228 drowley                  1768 GNC        7848 :             for (int i = 0; i < mcvlist->nitems; i++)
  744 tomas.vondra             1769 ECB             :             {
                               1770                 :                 int         j;
  578 michael                  1771 GIC        7746 :                 bool        match = !expr->useOr;
  744 tomas.vondra             1772            7746 :                 MCVItem    *item = &mcvlist->items[i];
                               1773                 : 
                               1774                 :                 /*
                               1775                 :                  * When the MCV item or the Const value is NULL we can treat
                               1776                 :                  * this as a mismatch. We must not call the operator because
  744 tomas.vondra             1777 ECB             :                  * of strictness.
                               1778                 :                  */
  744 tomas.vondra             1779 CBC        7746 :                 if (item->isnull[idx] || cst->constisnull)
 1121 tomas.vondra             1780 ECB             :                 {
  744 tomas.vondra             1781 GIC           9 :                     matches[i] = RESULT_MERGE(matches[i], is_or, false);
                               1782               9 :                     continue;
                               1783                 :                 }
                               1784                 : 
                               1785                 :                 /*
                               1786                 :                  * Skip MCV items that can't change result in the bitmap. Once
                               1787                 :                  * the value gets false for AND-lists, or true for OR-lists,
  744 tomas.vondra             1788 ECB             :                  * we don't need to look at more clauses.
 1121                          1789                 :                  */
  744 tomas.vondra             1790 GIC        7737 :                 if (RESULT_IS_FINAL(matches[i], is_or))
  744 tomas.vondra             1791 CBC        4008 :                     continue;
                               1792                 : 
                               1793           14169 :                 for (j = 0; j < num_elems; j++)
 1121 tomas.vondra             1794 ECB             :                 {
  744 tomas.vondra             1795 GIC       11817 :                     Datum       elem_value = elem_values[j];
                               1796           11817 :                     bool        elem_isnull = elem_nulls[j];
                               1797                 :                     bool        elem_match;
 1121 tomas.vondra             1798 ECB             : 
                               1799                 :                     /* NULL values always evaluate as not matching. */
  744 tomas.vondra             1800 CBC       11817 :                     if (elem_isnull)
 1121 tomas.vondra             1801 ECB             :                     {
  744 tomas.vondra             1802 GIC        1056 :                         match = RESULT_MERGE(match, expr->useOr, false);
 1121                          1803            1056 :                         continue;
                               1804                 :                     }
                               1805                 : 
                               1806                 :                     /*
                               1807                 :                      * Stop evaluating the array elements once we reach a
                               1808                 :                      * matching value that can't change - ALL() is the same as
  744 tomas.vondra             1809 ECB             :                      * AND-list, ANY() is the same as OR-list.
 1121                          1810                 :                      */
  744 tomas.vondra             1811 GIC       10761 :                     if (RESULT_IS_FINAL(match, expr->useOr))
  744 tomas.vondra             1812 CBC        1377 :                         break;
                               1813                 : 
                               1814            9384 :                     elem_match = DatumGetBool(FunctionCall2Coll(&opproc,
                               1815                 :                                                                 collid,
  744 tomas.vondra             1816 GIC        9384 :                                                                 item->values[idx],
  744 tomas.vondra             1817 ECB             :                                                                 elem_value));
                               1818                 : 
  744 tomas.vondra             1819 GIC        9384 :                     match = RESULT_MERGE(match, expr->useOr, elem_match);
                               1820                 :                 }
  744 tomas.vondra             1821 ECB             : 
                               1822                 :                 /* update the match bitmap with the result */
  744 tomas.vondra             1823 GIC        3729 :                 matches[i] = RESULT_MERGE(matches[i], is_or, match);
 1474 tomas.vondra             1824 ECB             :             }
                               1825                 :         }
 1474 tomas.vondra             1826 CBC         120 :         else if (IsA(clause, NullTest))
 1474 tomas.vondra             1827 ECB             :         {
 1474 tomas.vondra             1828 GIC          33 :             NullTest   *expr = (NullTest *) clause;
  744                          1829              33 :             Node       *clause_expr = (Node *) (expr->arg);
 1474 tomas.vondra             1830 ECB             : 
                               1831                 :             /* match the attribute/expression to a dimension of the statistic */
  744 tomas.vondra             1832 GIC          33 :             int         idx = mcv_match_expression(clause_expr, keys, exprs, NULL);
                               1833                 : 
                               1834                 :             /*
                               1835                 :              * Walk through the MCV items and evaluate the current clause. We
                               1836                 :              * can skip items that were already ruled out, and terminate if
 1474 tomas.vondra             1837 ECB             :              * there are no remaining MCV items that might possibly match.
                               1838                 :              */
  228 drowley                  1839 GNC        3039 :             for (int i = 0; i < mcvlist->nitems; i++)
 1474 tomas.vondra             1840 ECB             :             {
 1474 tomas.vondra             1841 GIC        3006 :                 bool        match = false;  /* assume mismatch */
 1473                          1842            3006 :                 MCVItem    *item = &mcvlist->items[i];
 1474 tomas.vondra             1843 ECB             : 
                               1844                 :                 /* if the clause mismatches the MCV item, update the bitmap */
 1474 tomas.vondra             1845 CBC        3006 :                 switch (expr->nulltesttype)
 1474 tomas.vondra             1846 ECB             :                 {
 1474 tomas.vondra             1847 CBC        2106 :                     case IS_NULL:
 1474 tomas.vondra             1848 GIC        2106 :                         match = (item->isnull[idx]) ? true : match;
 1474 tomas.vondra             1849 CBC        2106 :                         break;
 1474 tomas.vondra             1850 ECB             : 
 1474 tomas.vondra             1851 CBC         900 :                     case IS_NOT_NULL:
 1474 tomas.vondra             1852 GIC         900 :                         match = (!item->isnull[idx]) ? true : match;
                               1853             900 :                         break;
                               1854                 :                 }
 1474 tomas.vondra             1855 ECB             : 
                               1856                 :                 /* now, update the match bitmap, depending on OR/AND type */
 1362 tomas.vondra             1857 GIC        3006 :                 matches[i] = RESULT_MERGE(matches[i], is_or, match);
 1474 tomas.vondra             1858 ECB             :             }
                               1859                 :         }
 1474 tomas.vondra             1860 GIC          87 :         else if (is_orclause(clause) || is_andclause(clause))
                               1861              30 :         {
                               1862                 :             /* AND/OR clause, with all subclauses being compatible */
 1474 tomas.vondra             1863 ECB             : 
                               1864                 :             int         i;
 1474 tomas.vondra             1865 GIC          30 :             BoolExpr   *bool_clause = ((BoolExpr *) clause);
                               1866              30 :             List       *bool_clauses = bool_clause->args;
 1474 tomas.vondra             1867 ECB             : 
                               1868                 :             /* match/mismatch bitmap for each MCV item */
 1474 tomas.vondra             1869 CBC          30 :             bool       *bool_matches = NULL;
 1474 tomas.vondra             1870 ECB             : 
 1474 tomas.vondra             1871 GIC          30 :             Assert(bool_clauses != NIL);
                               1872              30 :             Assert(list_length(bool_clauses) >= 2);
 1474 tomas.vondra             1873 ECB             : 
                               1874                 :             /* build the match bitmap for the OR-clauses */
  744 tomas.vondra             1875 GIC          30 :             bool_matches = mcv_get_match_bitmap(root, bool_clauses, keys, exprs,
 1474                          1876              30 :                                                 mcvlist, is_orclause(clause));
                               1877                 : 
                               1878                 :             /*
                               1879                 :              * Merge the bitmap produced by mcv_get_match_bitmap into the
                               1880                 :              * current one. We need to consider if we're evaluating AND or OR
 1474 tomas.vondra             1881 ECB             :              * condition when merging the results.
                               1882                 :              */
 1474 tomas.vondra             1883 GIC        1878 :             for (i = 0; i < mcvlist->nitems; i++)
 1362 tomas.vondra             1884 CBC        1848 :                 matches[i] = RESULT_MERGE(matches[i], is_or, bool_matches[i]);
                               1885                 : 
 1474                          1886              30 :             pfree(bool_matches);
                               1887                 :         }
 1474 tomas.vondra             1888 GIC          57 :         else if (is_notclause(clause))
                               1889                 :         {
                               1890                 :             /* NOT clause, with all subclauses compatible */
 1474 tomas.vondra             1891 ECB             : 
                               1892                 :             int         i;
 1474 tomas.vondra             1893 GIC          15 :             BoolExpr   *not_clause = ((BoolExpr *) clause);
                               1894              15 :             List       *not_args = not_clause->args;
 1474 tomas.vondra             1895 ECB             : 
                               1896                 :             /* match/mismatch bitmap for each MCV item */
 1474 tomas.vondra             1897 CBC          15 :             bool       *not_matches = NULL;
 1474 tomas.vondra             1898 ECB             : 
 1474 tomas.vondra             1899 GIC          15 :             Assert(not_args != NIL);
                               1900              15 :             Assert(list_length(not_args) == 1);
 1474 tomas.vondra             1901 ECB             : 
                               1902                 :             /* build the match bitmap for the NOT-clause */
  744 tomas.vondra             1903 GIC          15 :             not_matches = mcv_get_match_bitmap(root, not_args, keys, exprs,
                               1904                 :                                                mcvlist, false);
                               1905                 : 
                               1906                 :             /*
                               1907                 :              * Merge the bitmap produced by mcv_get_match_bitmap into the
                               1908                 :              * current one. We're handling a NOT clause, so invert the result
 1362 tomas.vondra             1909 ECB             :              * before merging it into the global bitmap.
 1474                          1910                 :              */
 1474 tomas.vondra             1911 GIC          75 :             for (i = 0; i < mcvlist->nitems; i++)
 1362 tomas.vondra             1912 CBC          60 :                 matches[i] = RESULT_MERGE(matches[i], is_or, !not_matches[i]);
                               1913                 : 
 1474                          1914              15 :             pfree(not_matches);
                               1915                 :         }
 1474 tomas.vondra             1916 GIC          42 :         else if (IsA(clause, Var))
                               1917                 :         {
 1474 tomas.vondra             1918 ECB             :             /* Var (has to be a boolean Var, possibly from below NOT) */
                               1919                 : 
 1474 tomas.vondra             1920 GIC          39 :             Var        *var = (Var *) (clause);
 1474 tomas.vondra             1921 ECB             : 
                               1922                 :             /* match the attribute to a dimension of the statistic */
 1474 tomas.vondra             1923 CBC          39 :             int         idx = bms_member_index(keys, var->varattno);
                               1924                 : 
 1474 tomas.vondra             1925 GIC          39 :             Assert(var->vartype == BOOLOID);
                               1926                 : 
                               1927                 :             /*
                               1928                 :              * Walk through the MCV items and evaluate the current clause. We
                               1929                 :              * can skip items that were already ruled out, and terminate if
 1474 tomas.vondra             1930 ECB             :              * there are no remaining MCV items that might possibly match.
                               1931                 :              */
  228 drowley                  1932 GNC         189 :             for (int i = 0; i < mcvlist->nitems; i++)
 1474 tomas.vondra             1933 ECB             :             {
 1473 tomas.vondra             1934 GIC         150 :                 MCVItem    *item = &mcvlist->items[i];
 1474                          1935             150 :                 bool        match = false;
 1474 tomas.vondra             1936 ECB             : 
                               1937                 :                 /* if the item is NULL, it's a mismatch */
 1474 tomas.vondra             1938 GIC         150 :                 if (!item->isnull[idx] && DatumGetBool(item->values[idx]))
                               1939              75 :                     match = true;
 1474 tomas.vondra             1940 ECB             : 
                               1941                 :                 /* update the result bitmap */
 1362 tomas.vondra             1942 GIC         150 :                 matches[i] = RESULT_MERGE(matches[i], is_or, match);
                               1943                 :             }
                               1944                 :         }
                               1945                 :         else
                               1946                 :         {
                               1947                 :             /* Otherwise, it must be a bare boolean-returning expression */
                               1948                 :             int         idx;
  247 tgl                      1949 ECB             : 
                               1950                 :             /* match the expression to a dimension of the statistic */
  247 tgl                      1951 GIC           3 :             idx = mcv_match_expression(clause, keys, exprs, NULL);
                               1952                 : 
                               1953                 :             /*
                               1954                 :              * Walk through the MCV items and evaluate the current clause. We
                               1955                 :              * can skip items that were already ruled out, and terminate if
  247 tgl                      1956 ECB             :              * there are no remaining MCV items that might possibly match.
                               1957                 :              */
  228 drowley                  1958 GNC         111 :             for (int i = 0; i < mcvlist->nitems; i++)
  247 tgl                      1959 ECB             :             {
                               1960                 :                 bool        match;
  247 tgl                      1961 GIC         108 :                 MCVItem    *item = &mcvlist->items[i];
  247 tgl                      1962 ECB             : 
                               1963                 :                 /* "match" just means it's bool TRUE */
  247 tgl                      1964 GIC         108 :                 match = !item->isnull[idx] && DatumGetBool(item->values[idx]);
  247 tgl                      1965 ECB             : 
                               1966                 :                 /* now, update the match bitmap, depending on OR/AND type */
  247 tgl                      1967 GIC         108 :                 matches[i] = RESULT_MERGE(matches[i], is_or, match);
                               1968                 :             }
                               1969                 :         }
 1474 tomas.vondra             1970 ECB             :     }
                               1971                 : 
 1474 tomas.vondra             1972 GIC         357 :     return matches;
                               1973                 : }
                               1974                 : 
                               1975                 : 
                               1976                 : /*
                               1977                 :  * mcv_combine_selectivities
                               1978                 :  *      Combine per-column and multi-column MCV selectivity estimates.
                               1979                 :  *
                               1980                 :  * simple_sel is a "simple" selectivity estimate (produced without using any
                               1981                 :  * extended statistics, essentially assuming independence of columns/clauses).
                               1982                 :  *
                               1983                 :  * mcv_sel and mcv_basesel are sums of the frequencies and base frequencies of
                               1984                 :  * all matching MCV items.  The difference (mcv_sel - mcv_basesel) is then
                               1985                 :  * essentially interpreted as a correction to be added to simple_sel, as
                               1986                 :  * described below.
                               1987                 :  *
                               1988                 :  * mcv_totalsel is the sum of the frequencies of all MCV items (not just the
                               1989                 :  * matching ones).  This is used as an upper bound on the portion of the
                               1990                 :  * selectivity estimates not covered by the MCV statistics.
                               1991                 :  *
                               1992                 :  * Note: While simple and base selectivities are defined in a quite similar
                               1993                 :  * way, the values are computed differently and are not therefore equal. The
                               1994                 :  * simple selectivity is computed as a product of per-clause estimates, while
                               1995                 :  * the base selectivity is computed by adding up base frequencies of matching
                               1996                 :  * items of the multi-column MCV list. So the values may differ for two main
                               1997                 :  * reasons - (a) the MCV list may not cover 100% of the data and (b) some of
                               1998                 :  * the MCV items did not match the estimated clauses.
                               1999                 :  *
                               2000                 :  * As both (a) and (b) reduce the base selectivity value, it generally holds
                               2001                 :  * that (simple_sel >= mcv_basesel). If the MCV list covers all the data, the
                               2002                 :  * values may be equal.
                               2003                 :  *
                               2004                 :  * So, other_sel = (simple_sel - mcv_basesel) is an estimate for the part not
                               2005                 :  * covered by the MCV list, and (mcv_sel - mcv_basesel) may be seen as a
                               2006                 :  * correction for the part covered by the MCV list. Those two statements are
                               2007                 :  * actually equivalent.
  857 dean.a.rasheed           2008 ECB             :  */
                               2009                 : Selectivity
  857 dean.a.rasheed           2010 GIC         336 : mcv_combine_selectivities(Selectivity simple_sel,
                               2011                 :                           Selectivity mcv_sel,
                               2012                 :                           Selectivity mcv_basesel,
                               2013                 :                           Selectivity mcv_totalsel)
                               2014                 : {
                               2015                 :     Selectivity other_sel;
                               2016                 :     Selectivity sel;
  857 dean.a.rasheed           2017 ECB             : 
                               2018                 :     /* estimated selectivity of values not covered by MCV matches */
  857 dean.a.rasheed           2019 GIC         336 :     other_sel = simple_sel - mcv_basesel;
                               2020             336 :     CLAMP_PROBABILITY(other_sel);
  857 dean.a.rasheed           2021 ECB             : 
                               2022                 :     /* this non-MCV selectivity cannot exceed 1 - mcv_totalsel */
  857 dean.a.rasheed           2023 GIC         336 :     if (other_sel > 1.0 - mcv_totalsel)
                               2024             222 :         other_sel = 1.0 - mcv_totalsel;
  857 dean.a.rasheed           2025 ECB             : 
                               2026                 :     /* overall selectivity is the sum of the MCV and non-MCV parts */
  857 dean.a.rasheed           2027 GIC         336 :     sel = mcv_sel + other_sel;
  857 dean.a.rasheed           2028 CBC         336 :     CLAMP_PROBABILITY(sel);
                               2029                 : 
  857 dean.a.rasheed           2030 GIC         336 :     return sel;
                               2031                 : }
                               2032                 : 
                               2033                 : 
                               2034                 : /*
                               2035                 :  * mcv_clauselist_selectivity
                               2036                 :  *      Use MCV statistics to estimate the selectivity of an implicitly-ANDed
                               2037                 :  *      list of clauses.
                               2038                 :  *
                               2039                 :  * This determines which MCV items match every clause in the list and returns
                               2040                 :  * the sum of the frequencies of those items.
                               2041                 :  *
                               2042                 :  * In addition, it returns the sum of the base frequencies of each of those
                               2043                 :  * items (that is the sum of the selectivities that each item would have if
                               2044                 :  * the columns were independent of one another), and the total selectivity of
                               2045                 :  * all the MCV items (not just the matching ones).  These are expected to be
                               2046                 :  * used together with a "simple" selectivity estimate (one based only on
                               2047                 :  * per-column statistics) to produce an overall selectivity estimate that
                               2048                 :  * makes use of both per-column and multi-column statistics --- see
                               2049                 :  * mcv_combine_selectivities().
 1474 tomas.vondra             2050 ECB             :  */
                               2051                 : Selectivity
 1474 tomas.vondra             2052 GIC         192 : mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
                               2053                 :                            List *clauses, int varRelid,
                               2054                 :                            JoinType jointype, SpecialJoinInfo *sjinfo,
                               2055                 :                            RelOptInfo *rel,
                               2056                 :                            Selectivity *basesel, Selectivity *totalsel)
                               2057                 : {
 1474 tomas.vondra             2058 ECB             :     int         i;
                               2059                 :     MCVList    *mcv;
 1474 tomas.vondra             2060 GIC         192 :     Selectivity s = 0.0;
  448                          2061             192 :     RangeTblEntry *rte = root->simple_rte_array[rel->relid];
 1474 tomas.vondra             2062 ECB             : 
                               2063                 :     /* match/mismatch bitmap for each MCV item */
 1474 tomas.vondra             2064 GIC         192 :     bool       *matches = NULL;
 1474 tomas.vondra             2065 ECB             : 
                               2066                 :     /* load the MCV list stored in the statistics object */
  448 tomas.vondra             2067 GIC         192 :     mcv = statext_mcv_load(stat->statOid, rte->inh);
 1474 tomas.vondra             2068 ECB             : 
                               2069                 :     /* build a match bitmap for the clauses */
  744 tomas.vondra             2070 GIC         192 :     matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs,
                               2071                 :                                    mcv, false);
 1474 tomas.vondra             2072 ECB             : 
                               2073                 :     /* sum frequencies for all the matching MCV items */
 1474 tomas.vondra             2074 CBC         192 :     *basesel = 0.0;
 1474 tomas.vondra             2075 GIC         192 :     *totalsel = 0.0;
 1474 tomas.vondra             2076 CBC       12552 :     for (i = 0; i < mcv->nitems; i++)
                               2077                 :     {
 1473                          2078           12360 :         *totalsel += mcv->items[i].frequency;
                               2079                 : 
 1474                          2080           12360 :         if (matches[i] != false)
 1474 tomas.vondra             2081 ECB             :         {
 1473 tomas.vondra             2082 GIC         231 :             *basesel += mcv->items[i].base_frequency;
                               2083             231 :             s += mcv->items[i].frequency;
                               2084                 :         }
 1474 tomas.vondra             2085 ECB             :     }
                               2086                 : 
 1474 tomas.vondra             2087 GIC         192 :     return s;
                               2088                 : }
                               2089                 : 
                               2090                 : 
                               2091                 : /*
                               2092                 :  * mcv_clause_selectivity_or
                               2093                 :  *      Use MCV statistics to estimate the selectivity of a clause that
                               2094                 :  *      appears in an ORed list of clauses.
                               2095                 :  *
                               2096                 :  * As with mcv_clauselist_selectivity() this determines which MCV items match
                               2097                 :  * the clause and returns both the sum of the frequencies and the sum of the
                               2098                 :  * base frequencies of those items, as well as the sum of the frequencies of
                               2099                 :  * all MCV items (not just the matching ones) so that this information can be
                               2100                 :  * used by mcv_combine_selectivities() to produce a selectivity estimate that
                               2101                 :  * makes use of both per-column and multi-column statistics.
                               2102                 :  *
                               2103                 :  * Additionally, we return information to help compute the overall selectivity
                               2104                 :  * of the ORed list of clauses assumed to contain this clause.  This function
                               2105                 :  * is intended to be called for each clause in the ORed list of clauses,
                               2106                 :  * allowing the overall selectivity to be computed using the following
                               2107                 :  * algorithm:
                               2108                 :  *
                               2109                 :  * Suppose P[n] = P(C[1] OR C[2] OR ... OR C[n]) is the combined selectivity
                               2110                 :  * of the first n clauses in the list.  Then the combined selectivity taking
                               2111                 :  * into account the next clause C[n+1] can be written as
                               2112                 :  *
                               2113                 :  *      P[n+1] = P[n] + P(C[n+1]) - P((C[1] OR ... OR C[n]) AND C[n+1])
                               2114                 :  *
                               2115                 :  * The final term above represents the overlap between the clauses examined so
                               2116                 :  * far and the (n+1)'th clause.  To estimate its selectivity, we track the
                               2117                 :  * match bitmap for the ORed list of clauses examined so far and examine its
                               2118                 :  * intersection with the match bitmap for the (n+1)'th clause.
                               2119                 :  *
                               2120                 :  * We then also return the sums of the MCV item frequencies and base
                               2121                 :  * frequencies for the match bitmap intersection corresponding to the overlap
                               2122                 :  * term above, so that they can be combined with a simple selectivity estimate
                               2123                 :  * for that term.
                               2124                 :  *
                               2125                 :  * The parameter "or_matches" is an in/out parameter tracking the match bitmap
                               2126                 :  * for the clauses examined so far.  The caller is expected to set it to NULL
                               2127                 :  * the first time it calls this function.
  857 dean.a.rasheed           2128 ECB             :  */
                               2129                 : Selectivity
  857 dean.a.rasheed           2130 GIC         120 : mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
                               2131                 :                           MCVList *mcv, Node *clause, bool **or_matches,
                               2132                 :                           Selectivity *basesel, Selectivity *overlap_mcvsel,
  857 dean.a.rasheed           2133 ECB             :                           Selectivity *overlap_basesel, Selectivity *totalsel)
                               2134                 : {
  857 dean.a.rasheed           2135 GIC         120 :     Selectivity s = 0.0;
                               2136                 :     bool       *new_matches;
                               2137                 :     int         i;
  857 dean.a.rasheed           2138 ECB             : 
                               2139                 :     /* build the OR-matches bitmap, if not built already */
  857 dean.a.rasheed           2140 GIC         120 :     if (*or_matches == NULL)
                               2141              48 :         *or_matches = palloc0(sizeof(bool) * mcv->nitems);
  857 dean.a.rasheed           2142 ECB             : 
                               2143                 :     /* build the match bitmap for the new clause */
  857 dean.a.rasheed           2144 GIC         120 :     new_matches = mcv_get_match_bitmap(root, list_make1(clause), stat->keys,
                               2145                 :                                        stat->exprs, mcv, false);
                               2146                 : 
                               2147                 :     /*
                               2148                 :      * Sum the frequencies for all the MCV items matching this clause and also
                               2149                 :      * those matching the overlap between this clause and any of the preceding
  857 dean.a.rasheed           2150 ECB             :      * clauses as described above.
                               2151                 :      */
  857 dean.a.rasheed           2152 CBC         120 :     *basesel = 0.0;
                               2153             120 :     *overlap_mcvsel = 0.0;
                               2154             120 :     *overlap_basesel = 0.0;
  857 dean.a.rasheed           2155 GIC         120 :     *totalsel = 0.0;
  857 dean.a.rasheed           2156 CBC        7758 :     for (i = 0; i < mcv->nitems; i++)
                               2157                 :     {
                               2158            7638 :         *totalsel += mcv->items[i].frequency;
                               2159                 : 
                               2160            7638 :         if (new_matches[i])
  857 dean.a.rasheed           2161 ECB             :         {
  857 dean.a.rasheed           2162 GIC         168 :             s += mcv->items[i].frequency;
  857 dean.a.rasheed           2163 CBC         168 :             *basesel += mcv->items[i].base_frequency;
                               2164                 : 
                               2165             168 :             if ((*or_matches)[i])
  857 dean.a.rasheed           2166 ECB             :             {
  857 dean.a.rasheed           2167 GIC          72 :                 *overlap_mcvsel += mcv->items[i].frequency;
                               2168              72 :                 *overlap_basesel += mcv->items[i].base_frequency;
                               2169                 :             }
                               2170                 :         }
  857 dean.a.rasheed           2171 ECB             : 
                               2172                 :         /* update the OR-matches bitmap for the next clause */
  857 dean.a.rasheed           2173 GIC        7638 :         (*or_matches)[i] = (*or_matches)[i] || new_matches[i];
  857 dean.a.rasheed           2174 ECB             :     }
                               2175                 : 
  857 dean.a.rasheed           2176 CBC         120 :     pfree(new_matches);
                               2177                 : 
  857 dean.a.rasheed           2178 GIC         120 :     return s;
                               2179                 : }

Generated by: LCOV version v1.16-55-g56c0a2a