LCOV - differential code coverage report
Current view: top level - src/backend/utils/adt - rangetypes_typanalyze.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 93.4 % 152 142 1 6 3 1 89 52 6 88 1
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 5 5 5 5
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * rangetypes_typanalyze.c
       4                 :  *    Functions for gathering statistics from range columns
       5                 :  *
       6                 :  * For a range type column, histograms of lower and upper bounds, and
       7                 :  * the fraction of NULL and empty ranges are collected.
       8                 :  *
       9                 :  * Both histograms have the same length, and they are combined into a
      10                 :  * single array of ranges. This has the same shape as the histogram that
      11                 :  * std_typanalyze would collect, but the values are different. Each range
      12                 :  * in the array is a valid range, even though the lower and upper bounds
      13                 :  * come from different tuples. In theory, the standard scalar selectivity
      14                 :  * functions could be used with the combined histogram.
      15                 :  *
      16                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
      17                 :  * Portions Copyright (c) 1994, Regents of the University of California
      18                 :  *
      19                 :  *
      20                 :  * IDENTIFICATION
      21                 :  *    src/backend/utils/adt/rangetypes_typanalyze.c
      22                 :  *
      23                 :  *-------------------------------------------------------------------------
      24                 :  */
      25                 : #include "postgres.h"
      26                 : 
      27                 : #include "catalog/pg_operator.h"
      28                 : #include "commands/vacuum.h"
      29                 : #include "utils/float.h"
      30                 : #include "utils/fmgrprotos.h"
      31                 : #include "utils/lsyscache.h"
      32                 : #include "utils/rangetypes.h"
      33                 : #include "utils/multirangetypes.h"
      34                 : #include "varatt.h"
      35                 : 
      36                 : static int  float8_qsort_cmp(const void *a1, const void *a2, void *arg);
      37                 : static int  range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
      38                 : static void compute_range_stats(VacAttrStats *stats,
      39                 :                                 AnalyzeAttrFetchFunc fetchfunc, int samplerows,
      40                 :                                 double totalrows);
      41                 : 
      42                 : /*
      43                 :  * range_typanalyze -- typanalyze function for range columns
      44                 :  */
      45                 : Datum
      46 GIC          10 : range_typanalyze(PG_FUNCTION_ARGS)
      47 ECB             : {
      48 GIC          10 :     VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
      49 ECB             :     TypeCacheEntry *typcache;
      50 GIC          10 :     Form_pg_attribute attr = stats->attr;
      51 ECB             : 
      52                 :     /* Get information about range type; note column might be a domain */
      53 GIC          10 :     typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
      54 ECB             : 
      55 GIC          10 :     if (attr->attstattarget < 0)
      56 CBC          10 :         attr->attstattarget = default_statistics_target;
      57 ECB             : 
      58 GIC          10 :     stats->compute_stats = compute_range_stats;
      59 CBC          10 :     stats->extra_data = typcache;
      60 ECB             :     /* same as in std_typanalyze */
      61 GIC          10 :     stats->minrows = 300 * attr->attstattarget;
      62 ECB             : 
      63 GIC          10 :     PG_RETURN_BOOL(true);
      64 ECB             : }
      65                 : 
      66                 : /*
      67                 :  * multirange_typanalyze -- typanalyze function for multirange columns
      68                 :  *
      69                 :  * We do the same analysis as for ranges, but on the smallest range that
      70                 :  * completely includes the multirange.
      71                 :  */
      72                 : Datum
      73 GIC           6 : multirange_typanalyze(PG_FUNCTION_ARGS)
      74 ECB             : {
      75 GIC           6 :     VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
      76 ECB             :     TypeCacheEntry *typcache;
      77 GIC           6 :     Form_pg_attribute attr = stats->attr;
      78 ECB             : 
      79                 :     /* Get information about multirange type; note column might be a domain */
      80 GIC           6 :     typcache = multirange_get_typcache(fcinfo, getBaseType(stats->attrtypid));
      81 ECB             : 
      82 GIC           6 :     if (attr->attstattarget < 0)
      83 CBC           6 :         attr->attstattarget = default_statistics_target;
      84 ECB             : 
      85 GIC           6 :     stats->compute_stats = compute_range_stats;
      86 CBC           6 :     stats->extra_data = typcache;
      87 ECB             :     /* same as in std_typanalyze */
      88 GIC           6 :     stats->minrows = 300 * attr->attstattarget;
      89 ECB             : 
      90 GIC           6 :     PG_RETURN_BOOL(true);
      91 ECB             : }
      92                 : 
      93                 : /*
      94                 :  * Comparison function for sorting float8s, used for range lengths.
      95                 :  */
      96                 : static int
      97 GIC       76940 : float8_qsort_cmp(const void *a1, const void *a2, void *arg)
      98 ECB             : {
      99 GIC       76940 :     const float8 *f1 = (const float8 *) a1;
     100 CBC       76940 :     const float8 *f2 = (const float8 *) a2;
     101 ECB             : 
     102 GIC       76940 :     if (*f1 < *f2)
     103 CBC          46 :         return -1;
     104           76894 :     else if (*f1 == *f2)
     105           68443 :         return 0;
     106 ECB             :     else
     107 GIC        8451 :         return 1;
     108 ECB             : }
     109                 : 
     110                 : /*
     111                 :  * Comparison function for sorting RangeBounds.
     112                 :  */
     113                 : static int
     114 GIC     1226167 : range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
     115 ECB             : {
     116 GIC     1226167 :     RangeBound *b1 = (RangeBound *) a1;
     117 CBC     1226167 :     RangeBound *b2 = (RangeBound *) a2;
     118         1226167 :     TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
     119 ECB             : 
     120 GIC     1226167 :     return range_cmp_bounds(typcache, b1, b2);
     121 ECB             : }
     122                 : 
     123                 : /*
     124                 :  * compute_range_stats() -- compute statistics for a range column
     125                 :  */
     126                 : static void
     127 GIC          16 : compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
     128 ECB             :                     int samplerows, double totalrows)
     129                 : {
     130 GIC          16 :     TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
     131 CBC          16 :     TypeCacheEntry *mltrng_typcache = NULL;
     132 ECB             :     bool        has_subdiff;
     133 GIC          16 :     int         null_cnt = 0;
     134 CBC          16 :     int         non_null_cnt = 0;
     135              16 :     int         non_empty_cnt = 0;
     136              16 :     int         empty_cnt = 0;
     137 ECB             :     int         range_no;
     138                 :     int         slot_idx;
     139 GIC          16 :     int         num_bins = stats->attr->attstattarget;
     140 ECB             :     int         num_hist;
     141                 :     float8     *lengths;
     142                 :     RangeBound *lowers,
     143                 :                *uppers;
     144 GIC          16 :     double      total_width = 0;
     145 ECB             : 
     146 GIC          16 :     if (typcache->typtype == TYPTYPE_MULTIRANGE)
     147 ECB             :     {
     148 GIC           6 :         mltrng_typcache = typcache;
     149 CBC           6 :         typcache = typcache->rngtype;
     150 ECB             :     }
     151                 :     else
     152 GIC          10 :         Assert(typcache->typtype == TYPTYPE_RANGE);
     153 CBC          16 :     has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
     154 ECB             : 
     155                 :     /* Allocate memory to hold range bounds and lengths of the sample ranges. */
     156 GIC          16 :     lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
     157 CBC          16 :     uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
     158              16 :     lengths = (float8 *) palloc(sizeof(float8) * samplerows);
     159 ECB             : 
     160                 :     /* Loop over the sample ranges. */
     161 GIC       54567 :     for (range_no = 0; range_no < samplerows; range_no++)
     162 ECB             :     {
     163                 :         Datum       value;
     164                 :         bool        isnull,
     165                 :                     empty;
     166                 :         MultirangeType *multirange;
     167                 :         RangeType  *range;
     168                 :         RangeBound  lower,
     169                 :                     upper;
     170                 :         float8      length;
     171                 : 
     172 GIC       54551 :         vacuum_delay_point();
     173 ECB             : 
     174 GIC       54551 :         value = fetchfunc(stats, range_no, &isnull);
     175 CBC       54551 :         if (isnull)
     176 ECB             :         {
     177                 :             /* range is null, just count that */
     178 UIC           0 :             null_cnt++;
     179 UBC           0 :             continue;
     180 EUB             :         }
     181                 : 
     182                 :         /*
     183                 :          * XXX: should we ignore wide values, like std_typanalyze does, to
     184                 :          * avoid bloating the statistics table?
     185                 :          */
     186 GIC       54551 :         total_width += VARSIZE_ANY(DatumGetPointer(value));
     187 ECB             : 
     188                 :         /* Get range and deserialize it for further analysis. */
     189 GIC       54551 :         if (mltrng_typcache != NULL)
     190 ECB             :         {
     191                 :             /* Treat multiranges like a big range without gaps. */
     192 GIC       11133 :             multirange = DatumGetMultirangeTypeP(value);
     193 CBC       11133 :             if (!MultirangeIsEmpty(multirange))
     194 ECB             :             {
     195                 :                 RangeBound  tmp;
     196                 : 
     197 GIC        9621 :                 multirange_get_bounds(typcache, multirange, 0,
     198 ECB             :                                       &lower, &tmp);
     199 GIC        9621 :                 multirange_get_bounds(typcache, multirange,
     200 CBC        9621 :                                       multirange->rangeCount - 1,
     201 ECB             :                                       &tmp, &upper);
     202 GIC        9621 :                 empty = false;
     203 ECB             :             }
     204                 :             else
     205                 :             {
     206 GIC        1512 :                 empty = true;
     207 ECB             :             }
     208                 :         }
     209                 :         else
     210                 :         {
     211 GIC       43418 :             range = DatumGetRangeTypeP(value);
     212 CBC       43418 :             range_deserialize(typcache, range, &lower, &upper, &empty);
     213 ECB             :         }
     214                 : 
     215 GIC       54551 :         if (!empty)
     216 ECB             :         {
     217                 :             /* Remember bounds and length for further usage in histograms */
     218 GIC       46036 :             lowers[non_empty_cnt] = lower;
     219 CBC       46036 :             uppers[non_empty_cnt] = upper;
     220 ECB             : 
     221 GIC       46036 :             if (lower.infinite || upper.infinite)
     222 ECB             :             {
     223                 :                 /* Length of any kind of an infinite range is infinite */
     224 GIC        2021 :                 length = get_float8_infinity();
     225 ECB             :             }
     226 GIC       44015 :             else if (has_subdiff)
     227 ECB             :             {
     228                 :                 /*
     229                 :                  * For an ordinary range, use subdiff function between upper
     230                 :                  * and lower bound values.
     231                 :                  */
     232 GIC       44015 :                 length = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
     233 ECB             :                                                           typcache->rng_collation,
     234                 :                                                           upper.val, lower.val));
     235                 :             }
     236                 :             else
     237                 :             {
     238                 :                 /* Use default value of 1.0 if no subdiff is available. */
     239 UIC           0 :                 length = 1.0;
     240 EUB             :             }
     241 GIC       46036 :             lengths[non_empty_cnt] = length;
     242 ECB             : 
     243 GIC       46036 :             non_empty_cnt++;
     244 ECB             :         }
     245                 :         else
     246 GIC        8515 :             empty_cnt++;
     247 ECB             : 
     248 GIC       54551 :         non_null_cnt++;
     249 ECB             :     }
     250                 : 
     251 GIC          16 :     slot_idx = 0;
     252 ECB             : 
     253                 :     /* We can only compute real stats if we found some non-null values. */
     254 GIC          16 :     if (non_null_cnt > 0)
     255 ECB             :     {
     256                 :         Datum      *bound_hist_values;
     257                 :         Datum      *length_hist_values;
     258                 :         int         pos,
     259                 :                     posfrac,
     260                 :                     delta,
     261                 :                     deltafrac,
     262                 :                     i;
     263                 :         MemoryContext old_cxt;
     264                 :         float4     *emptyfrac;
     265                 : 
     266 GIC          16 :         stats->stats_valid = true;
     267 ECB             :         /* Do the simple null-frac and width stats */
     268 GIC          16 :         stats->stanullfrac = (double) null_cnt / (double) samplerows;
     269 CBC          16 :         stats->stawidth = total_width / (double) non_null_cnt;
     270 ECB             : 
     271                 :         /* Estimate that non-null values are unique */
     272 GIC          16 :         stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
     273 ECB             : 
     274                 :         /* Must copy the target values into anl_context */
     275 GIC          16 :         old_cxt = MemoryContextSwitchTo(stats->anl_context);
     276 ECB             : 
     277                 :         /*
     278                 :          * Generate a bounds histogram slot entry if there are at least two
     279                 :          * values.
     280                 :          */
     281 GIC          16 :         if (non_empty_cnt >= 2)
     282 ECB             :         {
     283                 :             /* Sort bound values */
     284 GIC          16 :             qsort_interruptible(lowers, non_empty_cnt, sizeof(RangeBound),
     285 ECB             :                                 range_bound_qsort_cmp, typcache);
     286 GIC          16 :             qsort_interruptible(uppers, non_empty_cnt, sizeof(RangeBound),
     287 ECB             :                                 range_bound_qsort_cmp, typcache);
     288                 : 
     289 GIC          16 :             num_hist = non_empty_cnt;
     290 CBC          16 :             if (num_hist > num_bins)
     291              10 :                 num_hist = num_bins + 1;
     292 ECB             : 
     293 GIC          16 :             bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
     294 ECB             : 
     295                 :             /*
     296                 :              * The object of this loop is to construct ranges from first and
     297                 :              * last entries in lowers[] and uppers[] along with evenly-spaced
     298                 :              * values in between. So the i'th value is a range of lowers[(i *
     299                 :              * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
     300                 :              * (num_hist - 1)]. But computing that subscript directly risks
     301                 :              * integer overflow when the stats target is more than a couple
     302                 :              * thousand.  Instead we add (nvals - 1) / (num_hist - 1) to pos
     303                 :              * at each step, tracking the integral and fractional parts of the
     304                 :              * sum separately.
     305                 :              */
     306 GIC          16 :             delta = (non_empty_cnt - 1) / (num_hist - 1);
     307 CBC          16 :             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
     308              16 :             pos = posfrac = 0;
     309 ECB             : 
     310 GIC        1062 :             for (i = 0; i < num_hist; i++)
     311 ECB             :             {
     312 GIC        1046 :                 bound_hist_values[i] = PointerGetDatum(range_serialize(typcache,
     313 CBC        1046 :                                                                        &lowers[pos],
     314            1046 :                                                                        &uppers[pos],
     315                 :                                                                        false,
     316                 :                                                                        NULL));
     317 GIC        1046 :                 pos += delta;
     318            1046 :                 posfrac += deltafrac;
     319 CBC        1046 :                 if (posfrac >= (num_hist - 1))
     320 ECB             :                 {
     321                 :                     /* fractional part exceeds 1, carry to integer part */
     322 GIC         990 :                     pos++;
     323             990 :                     posfrac -= (num_hist - 1);
     324 ECB             :                 }
     325                 :             }
     326                 : 
     327 GIC          16 :             stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
     328              16 :             stats->stavalues[slot_idx] = bound_hist_values;
     329 CBC          16 :             stats->numvalues[slot_idx] = num_hist;
     330 ECB             : 
     331                 :             /* Store ranges even if we're analyzing a multirange column */
     332 GIC          16 :             stats->statypid[slot_idx] = typcache->type_id;
     333              16 :             stats->statyplen[slot_idx] = typcache->typlen;
     334 CBC          16 :             stats->statypbyval[slot_idx] = typcache->typbyval;
     335              16 :             stats->statypalign[slot_idx] = typcache->typalign;
     336 ECB             : 
     337 CBC          16 :             slot_idx++;
     338                 :         }
     339 ECB             : 
     340                 :         /*
     341                 :          * Generate a length histogram slot entry if there are at least two
     342                 :          * values.
     343                 :          */
     344 GIC          16 :         if (non_empty_cnt >= 2)
     345                 :         {
     346 ECB             :             /*
     347                 :              * Ascending sort of range lengths for further filling of
     348                 :              * histogram
     349                 :              */
     350 GIC          16 :             qsort_interruptible(lengths, non_empty_cnt, sizeof(float8),
     351                 :                                 float8_qsort_cmp, NULL);
     352 ECB             : 
     353 GIC          16 :             num_hist = non_empty_cnt;
     354              16 :             if (num_hist > num_bins)
     355 CBC          10 :                 num_hist = num_bins + 1;
     356 ECB             : 
     357 CBC          16 :             length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
     358                 : 
     359 ECB             :             /*
     360                 :              * The object of this loop is to copy the first and last lengths[]
     361                 :              * entries along with evenly-spaced values in between. So the i'th
     362                 :              * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
     363                 :              * computing that subscript directly risks integer overflow when
     364                 :              * the stats target is more than a couple thousand.  Instead we
     365                 :              * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
     366                 :              * the integral and fractional parts of the sum separately.
     367                 :              */
     368 GIC          16 :             delta = (non_empty_cnt - 1) / (num_hist - 1);
     369              16 :             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
     370 CBC          16 :             pos = posfrac = 0;
     371 ECB             : 
     372 CBC        1062 :             for (i = 0; i < num_hist; i++)
     373                 :             {
     374            1046 :                 length_hist_values[i] = Float8GetDatum(lengths[pos]);
     375 GIC        1046 :                 pos += delta;
     376 CBC        1046 :                 posfrac += deltafrac;
     377            1046 :                 if (posfrac >= (num_hist - 1))
     378 ECB             :                 {
     379                 :                     /* fractional part exceeds 1, carry to integer part */
     380 GIC         990 :                     pos++;
     381             990 :                     posfrac -= (num_hist - 1);
     382 ECB             :                 }
     383                 :             }
     384                 :         }
     385                 :         else
     386                 :         {
     387                 :             /*
     388                 :              * Even when we don't create the histogram, store an empty array
     389                 :              * to mean "no histogram". We can't just leave stavalues NULL,
     390                 :              * because get_attstatsslot() errors if you ask for stavalues, and
     391                 :              * it's NULL. We'll still store the empty fraction in stanumbers.
     392                 :              */
     393 UIC           0 :             length_hist_values = palloc(0);
     394               0 :             num_hist = 0;
     395 EUB             :         }
     396 GBC          16 :         stats->staop[slot_idx] = Float8LessOperator;
     397 GIC          16 :         stats->stacoll[slot_idx] = InvalidOid;
     398 CBC          16 :         stats->stavalues[slot_idx] = length_hist_values;
     399              16 :         stats->numvalues[slot_idx] = num_hist;
     400              16 :         stats->statypid[slot_idx] = FLOAT8OID;
     401              16 :         stats->statyplen[slot_idx] = sizeof(float8);
     402              16 :         stats->statypbyval[slot_idx] = FLOAT8PASSBYVAL;
     403              16 :         stats->statypalign[slot_idx] = 'd';
     404 ECB             : 
     405                 :         /* Store the fraction of empty ranges */
     406 GIC          16 :         emptyfrac = (float4 *) palloc(sizeof(float4));
     407              16 :         *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
     408 CBC          16 :         stats->stanumbers[slot_idx] = emptyfrac;
     409              16 :         stats->numnumbers[slot_idx] = 1;
     410 ECB             : 
     411 CBC          16 :         stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
     412 GIC          16 :         slot_idx++;
     413 ECB             : 
     414 CBC          16 :         MemoryContextSwitchTo(old_cxt);
     415                 :     }
     416 LBC           0 :     else if (null_cnt > 0)
     417                 :     {
     418 EUB             :         /* We found only nulls; assume the column is entirely null */
     419 UIC           0 :         stats->stats_valid = true;
     420               0 :         stats->stanullfrac = 1.0;
     421 UBC           0 :         stats->stawidth = 0; /* "unknown" */
     422               0 :         stats->stadistinct = 0.0;    /* "unknown" */
     423 EUB             :     }
     424                 : 
     425                 :     /*
     426                 :      * We don't need to bother cleaning up any of our temporary palloc's. The
     427                 :      * hashtable should also go away, as it used a child memory context.
     428                 :      */
     429 GIC          16 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a