TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * rangetypes_typanalyze.c
4 : * Functions for gathering statistics from range columns
5 : *
6 : * For a range type column, histograms of lower and upper bounds, and
7 : * the fraction of NULL and empty ranges are collected.
8 : *
9 : * Both histograms have the same length, and they are combined into a
10 : * single array of ranges. This has the same shape as the histogram that
11 : * std_typanalyze would collect, but the values are different. Each range
12 : * in the array is a valid range, even though the lower and upper bounds
13 : * come from different tuples. In theory, the standard scalar selectivity
14 : * functions could be used with the combined histogram.
15 : *
16 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
17 : * Portions Copyright (c) 1994, Regents of the University of California
18 : *
19 : *
20 : * IDENTIFICATION
21 : * src/backend/utils/adt/rangetypes_typanalyze.c
22 : *
23 : *-------------------------------------------------------------------------
24 : */
25 : #include "postgres.h"
26 :
27 : #include "catalog/pg_operator.h"
28 : #include "commands/vacuum.h"
29 : #include "utils/float.h"
30 : #include "utils/fmgrprotos.h"
31 : #include "utils/lsyscache.h"
32 : #include "utils/rangetypes.h"
33 : #include "utils/multirangetypes.h"
34 : #include "varatt.h"
35 :
36 : static int float8_qsort_cmp(const void *a1, const void *a2, void *arg);
37 : static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
38 : static void compute_range_stats(VacAttrStats *stats,
39 : AnalyzeAttrFetchFunc fetchfunc, int samplerows,
40 : double totalrows);
41 :
42 : /*
43 : * range_typanalyze -- typanalyze function for range columns
44 : */
45 : Datum
46 GIC 10 : range_typanalyze(PG_FUNCTION_ARGS)
47 ECB : {
48 GIC 10 : VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
49 ECB : TypeCacheEntry *typcache;
50 GIC 10 : Form_pg_attribute attr = stats->attr;
51 ECB :
52 : /* Get information about range type; note column might be a domain */
53 GIC 10 : typcache = range_get_typcache(fcinfo, getBaseType(stats->attrtypid));
54 ECB :
55 GIC 10 : if (attr->attstattarget < 0)
56 CBC 10 : attr->attstattarget = default_statistics_target;
57 ECB :
58 GIC 10 : stats->compute_stats = compute_range_stats;
59 CBC 10 : stats->extra_data = typcache;
60 ECB : /* same as in std_typanalyze */
61 GIC 10 : stats->minrows = 300 * attr->attstattarget;
62 ECB :
63 GIC 10 : PG_RETURN_BOOL(true);
64 ECB : }
65 :
66 : /*
67 : * multirange_typanalyze -- typanalyze function for multirange columns
68 : *
69 : * We do the same analysis as for ranges, but on the smallest range that
70 : * completely includes the multirange.
71 : */
72 : Datum
73 GIC 6 : multirange_typanalyze(PG_FUNCTION_ARGS)
74 ECB : {
75 GIC 6 : VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
76 ECB : TypeCacheEntry *typcache;
77 GIC 6 : Form_pg_attribute attr = stats->attr;
78 ECB :
79 : /* Get information about multirange type; note column might be a domain */
80 GIC 6 : typcache = multirange_get_typcache(fcinfo, getBaseType(stats->attrtypid));
81 ECB :
82 GIC 6 : if (attr->attstattarget < 0)
83 CBC 6 : attr->attstattarget = default_statistics_target;
84 ECB :
85 GIC 6 : stats->compute_stats = compute_range_stats;
86 CBC 6 : stats->extra_data = typcache;
87 ECB : /* same as in std_typanalyze */
88 GIC 6 : stats->minrows = 300 * attr->attstattarget;
89 ECB :
90 GIC 6 : PG_RETURN_BOOL(true);
91 ECB : }
92 :
93 : /*
94 : * Comparison function for sorting float8s, used for range lengths.
95 : */
96 : static int
97 GIC 76940 : float8_qsort_cmp(const void *a1, const void *a2, void *arg)
98 ECB : {
99 GIC 76940 : const float8 *f1 = (const float8 *) a1;
100 CBC 76940 : const float8 *f2 = (const float8 *) a2;
101 ECB :
102 GIC 76940 : if (*f1 < *f2)
103 CBC 46 : return -1;
104 76894 : else if (*f1 == *f2)
105 68443 : return 0;
106 ECB : else
107 GIC 8451 : return 1;
108 ECB : }
109 :
110 : /*
111 : * Comparison function for sorting RangeBounds.
112 : */
113 : static int
114 GIC 1226167 : range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
115 ECB : {
116 GIC 1226167 : RangeBound *b1 = (RangeBound *) a1;
117 CBC 1226167 : RangeBound *b2 = (RangeBound *) a2;
118 1226167 : TypeCacheEntry *typcache = (TypeCacheEntry *) arg;
119 ECB :
120 GIC 1226167 : return range_cmp_bounds(typcache, b1, b2);
121 ECB : }
122 :
123 : /*
124 : * compute_range_stats() -- compute statistics for a range column
125 : */
126 : static void
127 GIC 16 : compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
128 ECB : int samplerows, double totalrows)
129 : {
130 GIC 16 : TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
131 CBC 16 : TypeCacheEntry *mltrng_typcache = NULL;
132 ECB : bool has_subdiff;
133 GIC 16 : int null_cnt = 0;
134 CBC 16 : int non_null_cnt = 0;
135 16 : int non_empty_cnt = 0;
136 16 : int empty_cnt = 0;
137 ECB : int range_no;
138 : int slot_idx;
139 GIC 16 : int num_bins = stats->attr->attstattarget;
140 ECB : int num_hist;
141 : float8 *lengths;
142 : RangeBound *lowers,
143 : *uppers;
144 GIC 16 : double total_width = 0;
145 ECB :
146 GIC 16 : if (typcache->typtype == TYPTYPE_MULTIRANGE)
147 ECB : {
148 GIC 6 : mltrng_typcache = typcache;
149 CBC 6 : typcache = typcache->rngtype;
150 ECB : }
151 : else
152 GIC 10 : Assert(typcache->typtype == TYPTYPE_RANGE);
153 CBC 16 : has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
154 ECB :
155 : /* Allocate memory to hold range bounds and lengths of the sample ranges. */
156 GIC 16 : lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
157 CBC 16 : uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
158 16 : lengths = (float8 *) palloc(sizeof(float8) * samplerows);
159 ECB :
160 : /* Loop over the sample ranges. */
161 GIC 54567 : for (range_no = 0; range_no < samplerows; range_no++)
162 ECB : {
163 : Datum value;
164 : bool isnull,
165 : empty;
166 : MultirangeType *multirange;
167 : RangeType *range;
168 : RangeBound lower,
169 : upper;
170 : float8 length;
171 :
172 GIC 54551 : vacuum_delay_point();
173 ECB :
174 GIC 54551 : value = fetchfunc(stats, range_no, &isnull);
175 CBC 54551 : if (isnull)
176 ECB : {
177 : /* range is null, just count that */
178 UIC 0 : null_cnt++;
179 UBC 0 : continue;
180 EUB : }
181 :
182 : /*
183 : * XXX: should we ignore wide values, like std_typanalyze does, to
184 : * avoid bloating the statistics table?
185 : */
186 GIC 54551 : total_width += VARSIZE_ANY(DatumGetPointer(value));
187 ECB :
188 : /* Get range and deserialize it for further analysis. */
189 GIC 54551 : if (mltrng_typcache != NULL)
190 ECB : {
191 : /* Treat multiranges like a big range without gaps. */
192 GIC 11133 : multirange = DatumGetMultirangeTypeP(value);
193 CBC 11133 : if (!MultirangeIsEmpty(multirange))
194 ECB : {
195 : RangeBound tmp;
196 :
197 GIC 9621 : multirange_get_bounds(typcache, multirange, 0,
198 ECB : &lower, &tmp);
199 GIC 9621 : multirange_get_bounds(typcache, multirange,
200 CBC 9621 : multirange->rangeCount - 1,
201 ECB : &tmp, &upper);
202 GIC 9621 : empty = false;
203 ECB : }
204 : else
205 : {
206 GIC 1512 : empty = true;
207 ECB : }
208 : }
209 : else
210 : {
211 GIC 43418 : range = DatumGetRangeTypeP(value);
212 CBC 43418 : range_deserialize(typcache, range, &lower, &upper, &empty);
213 ECB : }
214 :
215 GIC 54551 : if (!empty)
216 ECB : {
217 : /* Remember bounds and length for further usage in histograms */
218 GIC 46036 : lowers[non_empty_cnt] = lower;
219 CBC 46036 : uppers[non_empty_cnt] = upper;
220 ECB :
221 GIC 46036 : if (lower.infinite || upper.infinite)
222 ECB : {
223 : /* Length of any kind of an infinite range is infinite */
224 GIC 2021 : length = get_float8_infinity();
225 ECB : }
226 GIC 44015 : else if (has_subdiff)
227 ECB : {
228 : /*
229 : * For an ordinary range, use subdiff function between upper
230 : * and lower bound values.
231 : */
232 GIC 44015 : length = DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
233 ECB : typcache->rng_collation,
234 : upper.val, lower.val));
235 : }
236 : else
237 : {
238 : /* Use default value of 1.0 if no subdiff is available. */
239 UIC 0 : length = 1.0;
240 EUB : }
241 GIC 46036 : lengths[non_empty_cnt] = length;
242 ECB :
243 GIC 46036 : non_empty_cnt++;
244 ECB : }
245 : else
246 GIC 8515 : empty_cnt++;
247 ECB :
248 GIC 54551 : non_null_cnt++;
249 ECB : }
250 :
251 GIC 16 : slot_idx = 0;
252 ECB :
253 : /* We can only compute real stats if we found some non-null values. */
254 GIC 16 : if (non_null_cnt > 0)
255 ECB : {
256 : Datum *bound_hist_values;
257 : Datum *length_hist_values;
258 : int pos,
259 : posfrac,
260 : delta,
261 : deltafrac,
262 : i;
263 : MemoryContext old_cxt;
264 : float4 *emptyfrac;
265 :
266 GIC 16 : stats->stats_valid = true;
267 ECB : /* Do the simple null-frac and width stats */
268 GIC 16 : stats->stanullfrac = (double) null_cnt / (double) samplerows;
269 CBC 16 : stats->stawidth = total_width / (double) non_null_cnt;
270 ECB :
271 : /* Estimate that non-null values are unique */
272 GIC 16 : stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac);
273 ECB :
274 : /* Must copy the target values into anl_context */
275 GIC 16 : old_cxt = MemoryContextSwitchTo(stats->anl_context);
276 ECB :
277 : /*
278 : * Generate a bounds histogram slot entry if there are at least two
279 : * values.
280 : */
281 GIC 16 : if (non_empty_cnt >= 2)
282 ECB : {
283 : /* Sort bound values */
284 GIC 16 : qsort_interruptible(lowers, non_empty_cnt, sizeof(RangeBound),
285 ECB : range_bound_qsort_cmp, typcache);
286 GIC 16 : qsort_interruptible(uppers, non_empty_cnt, sizeof(RangeBound),
287 ECB : range_bound_qsort_cmp, typcache);
288 :
289 GIC 16 : num_hist = non_empty_cnt;
290 CBC 16 : if (num_hist > num_bins)
291 10 : num_hist = num_bins + 1;
292 ECB :
293 GIC 16 : bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
294 ECB :
295 : /*
296 : * The object of this loop is to construct ranges from first and
297 : * last entries in lowers[] and uppers[] along with evenly-spaced
298 : * values in between. So the i'th value is a range of lowers[(i *
299 : * (nvals - 1)) / (num_hist - 1)] and uppers[(i * (nvals - 1)) /
300 : * (num_hist - 1)]. But computing that subscript directly risks
301 : * integer overflow when the stats target is more than a couple
302 : * thousand. Instead we add (nvals - 1) / (num_hist - 1) to pos
303 : * at each step, tracking the integral and fractional parts of the
304 : * sum separately.
305 : */
306 GIC 16 : delta = (non_empty_cnt - 1) / (num_hist - 1);
307 CBC 16 : deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
308 16 : pos = posfrac = 0;
309 ECB :
310 GIC 1062 : for (i = 0; i < num_hist; i++)
311 ECB : {
312 GIC 1046 : bound_hist_values[i] = PointerGetDatum(range_serialize(typcache,
313 CBC 1046 : &lowers[pos],
314 1046 : &uppers[pos],
315 : false,
316 : NULL));
317 GIC 1046 : pos += delta;
318 1046 : posfrac += deltafrac;
319 CBC 1046 : if (posfrac >= (num_hist - 1))
320 ECB : {
321 : /* fractional part exceeds 1, carry to integer part */
322 GIC 990 : pos++;
323 990 : posfrac -= (num_hist - 1);
324 ECB : }
325 : }
326 :
327 GIC 16 : stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
328 16 : stats->stavalues[slot_idx] = bound_hist_values;
329 CBC 16 : stats->numvalues[slot_idx] = num_hist;
330 ECB :
331 : /* Store ranges even if we're analyzing a multirange column */
332 GIC 16 : stats->statypid[slot_idx] = typcache->type_id;
333 16 : stats->statyplen[slot_idx] = typcache->typlen;
334 CBC 16 : stats->statypbyval[slot_idx] = typcache->typbyval;
335 16 : stats->statypalign[slot_idx] = typcache->typalign;
336 ECB :
337 CBC 16 : slot_idx++;
338 : }
339 ECB :
340 : /*
341 : * Generate a length histogram slot entry if there are at least two
342 : * values.
343 : */
344 GIC 16 : if (non_empty_cnt >= 2)
345 : {
346 ECB : /*
347 : * Ascending sort of range lengths for further filling of
348 : * histogram
349 : */
350 GIC 16 : qsort_interruptible(lengths, non_empty_cnt, sizeof(float8),
351 : float8_qsort_cmp, NULL);
352 ECB :
353 GIC 16 : num_hist = non_empty_cnt;
354 16 : if (num_hist > num_bins)
355 CBC 10 : num_hist = num_bins + 1;
356 ECB :
357 CBC 16 : length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
358 :
359 ECB : /*
360 : * The object of this loop is to copy the first and last lengths[]
361 : * entries along with evenly-spaced values in between. So the i'th
362 : * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
363 : * computing that subscript directly risks integer overflow when
364 : * the stats target is more than a couple thousand. Instead we
365 : * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking
366 : * the integral and fractional parts of the sum separately.
367 : */
368 GIC 16 : delta = (non_empty_cnt - 1) / (num_hist - 1);
369 16 : deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
370 CBC 16 : pos = posfrac = 0;
371 ECB :
372 CBC 1062 : for (i = 0; i < num_hist; i++)
373 : {
374 1046 : length_hist_values[i] = Float8GetDatum(lengths[pos]);
375 GIC 1046 : pos += delta;
376 CBC 1046 : posfrac += deltafrac;
377 1046 : if (posfrac >= (num_hist - 1))
378 ECB : {
379 : /* fractional part exceeds 1, carry to integer part */
380 GIC 990 : pos++;
381 990 : posfrac -= (num_hist - 1);
382 ECB : }
383 : }
384 : }
385 : else
386 : {
387 : /*
388 : * Even when we don't create the histogram, store an empty array
389 : * to mean "no histogram". We can't just leave stavalues NULL,
390 : * because get_attstatsslot() errors if you ask for stavalues, and
391 : * it's NULL. We'll still store the empty fraction in stanumbers.
392 : */
393 UIC 0 : length_hist_values = palloc(0);
394 0 : num_hist = 0;
395 EUB : }
396 GBC 16 : stats->staop[slot_idx] = Float8LessOperator;
397 GIC 16 : stats->stacoll[slot_idx] = InvalidOid;
398 CBC 16 : stats->stavalues[slot_idx] = length_hist_values;
399 16 : stats->numvalues[slot_idx] = num_hist;
400 16 : stats->statypid[slot_idx] = FLOAT8OID;
401 16 : stats->statyplen[slot_idx] = sizeof(float8);
402 16 : stats->statypbyval[slot_idx] = FLOAT8PASSBYVAL;
403 16 : stats->statypalign[slot_idx] = 'd';
404 ECB :
405 : /* Store the fraction of empty ranges */
406 GIC 16 : emptyfrac = (float4 *) palloc(sizeof(float4));
407 16 : *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
408 CBC 16 : stats->stanumbers[slot_idx] = emptyfrac;
409 16 : stats->numnumbers[slot_idx] = 1;
410 ECB :
411 CBC 16 : stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
412 GIC 16 : slot_idx++;
413 ECB :
414 CBC 16 : MemoryContextSwitchTo(old_cxt);
415 : }
416 LBC 0 : else if (null_cnt > 0)
417 : {
418 EUB : /* We found only nulls; assume the column is entirely null */
419 UIC 0 : stats->stats_valid = true;
420 0 : stats->stanullfrac = 1.0;
421 UBC 0 : stats->stawidth = 0; /* "unknown" */
422 0 : stats->stadistinct = 0.0; /* "unknown" */
423 EUB : }
424 :
425 : /*
426 : * We don't need to bother cleaning up any of our temporary palloc's. The
427 : * hashtable should also go away, as it used a child memory context.
428 : */
429 GIC 16 : }
|