Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * network_selfuncs.c
4 : : * Functions for selectivity estimation of inet/cidr operators
5 : : *
6 : : * This module provides estimators for the subnet inclusion and overlap
7 : : * operators. Estimates are based on null fraction, most common values,
8 : : * and histogram of inet/cidr columns.
9 : : *
10 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
11 : : * Portions Copyright (c) 1994, Regents of the University of California
12 : : *
13 : : *
14 : : * IDENTIFICATION
15 : : * src/backend/utils/adt/network_selfuncs.c
16 : : *
17 : : *-------------------------------------------------------------------------
18 : : */
19 : : #include "postgres.h"
20 : :
21 : : #include <math.h>
22 : :
23 : : #include "access/htup_details.h"
24 : : #include "catalog/pg_operator.h"
25 : : #include "catalog/pg_statistic.h"
26 : : #include "utils/fmgrprotos.h"
27 : : #include "utils/inet.h"
28 : : #include "utils/lsyscache.h"
29 : : #include "utils/selfuncs.h"
30 : :
31 : :
32 : : /* Default selectivity for the inet overlap operator */
33 : : #define DEFAULT_OVERLAP_SEL 0.01
34 : :
35 : : /* Default selectivity for the various inclusion operators */
36 : : #define DEFAULT_INCLUSION_SEL 0.005
37 : :
38 : : /* Default selectivity for specified operator */
39 : : #define DEFAULT_SEL(operator) \
40 : : ((operator) == OID_INET_OVERLAP_OP ? \
41 : : DEFAULT_OVERLAP_SEL : DEFAULT_INCLUSION_SEL)
42 : :
43 : : /* Maximum number of items to consider in join selectivity calculations */
44 : : #define MAX_CONSIDERED_ELEMS 1024
45 : :
46 : : static Selectivity networkjoinsel_inner(Oid operator,
47 : : VariableStatData *vardata1, VariableStatData *vardata2);
48 : : static Selectivity networkjoinsel_semi(Oid operator,
49 : : VariableStatData *vardata1, VariableStatData *vardata2);
50 : : static Selectivity mcv_population(float4 *mcv_numbers, int mcv_nvalues);
51 : : static Selectivity inet_hist_value_sel(Datum *values, int nvalues,
52 : : Datum constvalue, int opr_codenum);
53 : : static Selectivity inet_mcv_join_sel(Datum *mcv1_values,
54 : : float4 *mcv1_numbers, int mcv1_nvalues, Datum *mcv2_values,
55 : : float4 *mcv2_numbers, int mcv2_nvalues, Oid operator);
56 : : static Selectivity inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers,
57 : : int mcv_nvalues, Datum *hist_values, int hist_nvalues,
58 : : int opr_codenum);
59 : : static Selectivity inet_hist_inclusion_join_sel(Datum *hist1_values,
60 : : int hist1_nvalues,
61 : : Datum *hist2_values, int hist2_nvalues,
62 : : int opr_codenum);
63 : : static Selectivity inet_semi_join_sel(Datum lhs_value,
64 : : bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
65 : : bool hist_exists, Datum *hist_values, int hist_nvalues,
66 : : double hist_weight,
67 : : FmgrInfo *proc, int opr_codenum);
68 : : static int inet_opr_codenum(Oid operator);
69 : : static int inet_inclusion_cmp(inet *left, inet *right, int opr_codenum);
70 : : static int inet_masklen_inclusion_cmp(inet *left, inet *right,
71 : : int opr_codenum);
72 : : static int inet_hist_match_divider(inet *boundary, inet *query,
73 : : int opr_codenum);
74 : :
75 : : /*
76 : : * Selectivity estimation for the subnet inclusion/overlap operators
77 : : */
78 : : Datum
3659 tgl@sss.pgh.pa.us 79 :CBC 450 : networksel(PG_FUNCTION_ARGS)
80 : : {
3301 81 : 450 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
82 : 450 : Oid operator = PG_GETARG_OID(1);
83 : 450 : List *args = (List *) PG_GETARG_POINTER(2);
84 : 450 : int varRelid = PG_GETARG_INT32(3);
85 : : VariableStatData vardata;
86 : : Node *other;
87 : : bool varonleft;
88 : : Selectivity selec,
89 : : mcv_selec,
90 : : non_mcv_selec;
91 : : Datum constvalue;
92 : : Form_pg_statistic stats;
93 : : AttStatsSlot hslot;
94 : : double sumcommon,
95 : : nullfrac;
96 : : FmgrInfo proc;
97 : :
98 : : /*
99 : : * If expression is not (variable op something) or (something op
100 : : * variable), then punt and return a default estimate.
101 : : */
102 [ - + ]: 450 : if (!get_restriction_variable(root, args, varRelid,
103 : : &vardata, &other, &varonleft))
3301 tgl@sss.pgh.pa.us 104 [ # # ]:UBC 0 : PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
105 : :
106 : : /*
107 : : * Can't do anything useful if the something is not a constant, either.
108 : : */
3301 tgl@sss.pgh.pa.us 109 [ - + ]:CBC 450 : if (!IsA(other, Const))
110 : : {
3301 tgl@sss.pgh.pa.us 111 [ # # ]:UBC 0 : ReleaseVariableStats(vardata);
112 [ # # ]: 0 : PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
113 : : }
114 : :
115 : : /* All of the operators handled here are strict. */
3301 tgl@sss.pgh.pa.us 116 [ - + ]:CBC 450 : if (((Const *) other)->constisnull)
117 : : {
3301 tgl@sss.pgh.pa.us 118 [ # # ]:UBC 0 : ReleaseVariableStats(vardata);
119 : 0 : PG_RETURN_FLOAT8(0.0);
120 : : }
3301 tgl@sss.pgh.pa.us 121 :CBC 450 : constvalue = ((Const *) other)->constvalue;
122 : :
123 : : /* Otherwise, we need stats in order to produce a non-default estimate. */
124 [ + - ]: 450 : if (!HeapTupleIsValid(vardata.statsTuple))
125 : : {
126 [ - + ]: 450 : ReleaseVariableStats(vardata);
127 [ + + ]: 450 : PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
128 : : }
129 : :
3301 tgl@sss.pgh.pa.us 130 :UBC 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
131 : 0 : nullfrac = stats->stanullfrac;
132 : :
133 : : /*
134 : : * If we have most-common-values info, add up the fractions of the MCV
135 : : * entries that satisfy MCV OP CONST. These fractions contribute directly
136 : : * to the result selectivity. Also add up the total fraction represented
137 : : * by MCV entries.
138 : : */
139 : 0 : fmgr_info(get_opcode(operator), &proc);
1409 140 : 0 : mcv_selec = mcv_selectivity(&vardata, &proc, InvalidOid,
141 : : constvalue, varonleft,
142 : : &sumcommon);
143 : :
144 : : /*
145 : : * If we have a histogram, use it to estimate the proportion of the
146 : : * non-MCV population that satisfies the clause. If we don't, apply the
147 : : * default selectivity to that population.
148 : : */
2528 149 [ # # ]: 0 : if (get_attstatsslot(&hslot, vardata.statsTuple,
150 : : STATISTIC_KIND_HISTOGRAM, InvalidOid,
151 : : ATTSTATSSLOT_VALUES))
152 : : {
3301 153 : 0 : int opr_codenum = inet_opr_codenum(operator);
154 : :
155 : : /* Commute if needed, so we can consider histogram to be on the left */
156 [ # # ]: 0 : if (!varonleft)
157 : 0 : opr_codenum = -opr_codenum;
2528 158 : 0 : non_mcv_selec = inet_hist_value_sel(hslot.values, hslot.nvalues,
159 : : constvalue, opr_codenum);
160 : :
161 : 0 : free_attstatsslot(&hslot);
162 : : }
163 : : else
3301 164 [ # # ]: 0 : non_mcv_selec = DEFAULT_SEL(operator);
165 : :
166 : : /* Combine selectivities for MCV and non-MCV populations */
167 : 0 : selec = mcv_selec + (1.0 - nullfrac - sumcommon) * non_mcv_selec;
168 : :
169 : : /* Result should be in range, but make sure... */
170 [ # # # # ]: 0 : CLAMP_PROBABILITY(selec);
171 : :
172 [ # # ]: 0 : ReleaseVariableStats(vardata);
173 : :
174 : 0 : PG_RETURN_FLOAT8(selec);
175 : : }
176 : :
177 : : /*
178 : : * Join selectivity estimation for the subnet inclusion/overlap operators
179 : : *
180 : : * This function has the same structure as eqjoinsel() in selfuncs.c.
181 : : *
182 : : * Throughout networkjoinsel and its subroutines, we have a performance issue
183 : : * in that the amount of work to be done is O(N^2) in the length of the MCV
184 : : * and histogram arrays. To keep the runtime from getting out of hand when
185 : : * large statistics targets have been set, we arbitrarily limit the number of
186 : : * values considered to 1024 (MAX_CONSIDERED_ELEMS). For the MCV arrays, this
187 : : * is easy: just consider at most the first N elements. (Since the MCVs are
188 : : * sorted by decreasing frequency, this correctly gets us the first N MCVs.)
189 : : * For the histogram arrays, we decimate; that is consider only every k'th
190 : : * element, where k is chosen so that no more than MAX_CONSIDERED_ELEMS
191 : : * elements are considered. This should still give us a good random sample of
192 : : * the non-MCV population. Decimation is done on-the-fly in the loops that
193 : : * iterate over the histogram arrays.
194 : : */
195 : : Datum
3659 196 : 0 : networkjoinsel(PG_FUNCTION_ARGS)
197 : : {
3301 198 : 0 : PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
199 : 0 : Oid operator = PG_GETARG_OID(1);
200 : 0 : List *args = (List *) PG_GETARG_POINTER(2);
201 : : #ifdef NOT_USED
202 : : JoinType jointype = (JoinType) PG_GETARG_INT16(3);
203 : : #endif
204 : 0 : SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
205 : : double selec;
206 : : VariableStatData vardata1;
207 : : VariableStatData vardata2;
208 : : bool join_is_reversed;
209 : :
210 : 0 : get_join_variables(root, args, sjinfo,
211 : : &vardata1, &vardata2, &join_is_reversed);
212 : :
213 [ # # # ]: 0 : switch (sjinfo->jointype)
214 : : {
215 : 0 : case JOIN_INNER:
216 : : case JOIN_LEFT:
217 : : case JOIN_FULL:
218 : :
219 : : /*
220 : : * Selectivity for left/full join is not exactly the same as inner
221 : : * join, but we neglect the difference, as eqjoinsel does.
222 : : */
223 : 0 : selec = networkjoinsel_inner(operator, &vardata1, &vardata2);
224 : 0 : break;
225 : 0 : case JOIN_SEMI:
226 : : case JOIN_ANTI:
227 : : /* Here, it's important that we pass the outer var on the left. */
228 [ # # ]: 0 : if (!join_is_reversed)
229 : 0 : selec = networkjoinsel_semi(operator, &vardata1, &vardata2);
230 : : else
231 : 0 : selec = networkjoinsel_semi(get_commutator(operator),
232 : : &vardata2, &vardata1);
233 : 0 : break;
234 : 0 : default:
235 : : /* other values not expected here */
236 [ # # ]: 0 : elog(ERROR, "unrecognized join type: %d",
237 : : (int) sjinfo->jointype);
238 : : selec = 0; /* keep compiler quiet */
239 : : break;
240 : : }
241 : :
242 [ # # ]: 0 : ReleaseVariableStats(vardata1);
243 [ # # ]: 0 : ReleaseVariableStats(vardata2);
244 : :
245 [ # # # # ]: 0 : CLAMP_PROBABILITY(selec);
246 : :
247 : 0 : PG_RETURN_FLOAT8((float8) selec);
248 : : }
249 : :
250 : : /*
251 : : * Inner join selectivity estimation for subnet inclusion/overlap operators
252 : : *
253 : : * Calculates MCV vs MCV, MCV vs histogram and histogram vs histogram
254 : : * selectivity for join using the subnet inclusion operators. Unlike the
255 : : * join selectivity function for the equality operator, eqjoinsel_inner(),
256 : : * one to one matching of the values is not enough. Network inclusion
257 : : * operators are likely to match many to many, so we must check all pairs.
258 : : * (Note: it might be possible to exploit understanding of the histogram's
259 : : * btree ordering to reduce the work needed, but we don't currently try.)
260 : : * Also, MCV vs histogram selectivity is not neglected as in eqjoinsel_inner().
261 : : */
262 : : static Selectivity
263 : 0 : networkjoinsel_inner(Oid operator,
264 : : VariableStatData *vardata1, VariableStatData *vardata2)
265 : : {
266 : : Form_pg_statistic stats;
267 : 0 : double nullfrac1 = 0.0,
268 : 0 : nullfrac2 = 0.0;
269 : 0 : Selectivity selec = 0.0,
270 : 0 : sumcommon1 = 0.0,
271 : 0 : sumcommon2 = 0.0;
272 : 0 : bool mcv1_exists = false,
273 : 0 : mcv2_exists = false,
274 : 0 : hist1_exists = false,
275 : 0 : hist2_exists = false;
276 : : int opr_codenum;
2528 277 : 0 : int mcv1_length = 0,
3301 278 : 0 : mcv2_length = 0;
279 : : AttStatsSlot mcv1_slot;
280 : : AttStatsSlot mcv2_slot;
281 : : AttStatsSlot hist1_slot;
282 : : AttStatsSlot hist2_slot;
283 : :
284 [ # # ]: 0 : if (HeapTupleIsValid(vardata1->statsTuple))
285 : : {
286 : 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
287 : 0 : nullfrac1 = stats->stanullfrac;
288 : :
2528 289 : 0 : mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
290 : : STATISTIC_KIND_MCV, InvalidOid,
291 : : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
292 : 0 : hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
293 : : STATISTIC_KIND_HISTOGRAM, InvalidOid,
294 : : ATTSTATSSLOT_VALUES);
295 : : /* Arbitrarily limit number of MCVs considered */
296 : 0 : mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
3301 297 [ # # ]: 0 : if (mcv1_exists)
2528 298 : 0 : sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
299 : : }
300 : : else
301 : : {
302 : 0 : memset(&mcv1_slot, 0, sizeof(mcv1_slot));
303 : 0 : memset(&hist1_slot, 0, sizeof(hist1_slot));
304 : : }
305 : :
3301 306 [ # # ]: 0 : if (HeapTupleIsValid(vardata2->statsTuple))
307 : : {
308 : 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
309 : 0 : nullfrac2 = stats->stanullfrac;
310 : :
2528 311 : 0 : mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
312 : : STATISTIC_KIND_MCV, InvalidOid,
313 : : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
314 : 0 : hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
315 : : STATISTIC_KIND_HISTOGRAM, InvalidOid,
316 : : ATTSTATSSLOT_VALUES);
317 : : /* Arbitrarily limit number of MCVs considered */
318 : 0 : mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
3301 319 [ # # ]: 0 : if (mcv2_exists)
2528 320 : 0 : sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
321 : : }
322 : : else
323 : : {
324 : 0 : memset(&mcv2_slot, 0, sizeof(mcv2_slot));
325 : 0 : memset(&hist2_slot, 0, sizeof(hist2_slot));
326 : : }
327 : :
3301 328 : 0 : opr_codenum = inet_opr_codenum(operator);
329 : :
330 : : /*
331 : : * Calculate selectivity for MCV vs MCV matches.
332 : : */
333 [ # # # # ]: 0 : if (mcv1_exists && mcv2_exists)
2528 334 : 0 : selec += inet_mcv_join_sel(mcv1_slot.values, mcv1_slot.numbers,
335 : : mcv1_length,
336 : : mcv2_slot.values, mcv2_slot.numbers,
337 : : mcv2_length,
338 : : operator);
339 : :
340 : : /*
341 : : * Add in selectivities for MCV vs histogram matches, scaling according to
342 : : * the fractions of the populations represented by the histograms. Note
343 : : * that the second case needs to commute the operator.
344 : : */
3301 345 [ # # # # ]: 0 : if (mcv1_exists && hist2_exists)
346 : 0 : selec += (1.0 - nullfrac2 - sumcommon2) *
2528 347 : 0 : inet_mcv_hist_sel(mcv1_slot.values, mcv1_slot.numbers, mcv1_length,
348 : : hist2_slot.values, hist2_slot.nvalues,
349 : : opr_codenum);
3301 350 [ # # # # ]: 0 : if (mcv2_exists && hist1_exists)
351 : 0 : selec += (1.0 - nullfrac1 - sumcommon1) *
2528 352 : 0 : inet_mcv_hist_sel(mcv2_slot.values, mcv2_slot.numbers, mcv2_length,
353 : : hist1_slot.values, hist1_slot.nvalues,
354 : : -opr_codenum);
355 : :
356 : : /*
357 : : * Add in selectivity for histogram vs histogram matches, again scaling
358 : : * appropriately.
359 : : */
3301 360 [ # # # # ]: 0 : if (hist1_exists && hist2_exists)
361 : 0 : selec += (1.0 - nullfrac1 - sumcommon1) *
362 : 0 : (1.0 - nullfrac2 - sumcommon2) *
2528 363 : 0 : inet_hist_inclusion_join_sel(hist1_slot.values, hist1_slot.nvalues,
364 : : hist2_slot.values, hist2_slot.nvalues,
365 : : opr_codenum);
366 : :
367 : : /*
368 : : * If useful statistics are not available then use the default estimate.
369 : : * We can apply null fractions if known, though.
370 : : */
3301 371 [ # # # # : 0 : if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
# # # # ]
372 [ # # ]: 0 : selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
373 : :
374 : : /* Release stats. */
2528 375 : 0 : free_attstatsslot(&mcv1_slot);
376 : 0 : free_attstatsslot(&mcv2_slot);
377 : 0 : free_attstatsslot(&hist1_slot);
378 : 0 : free_attstatsslot(&hist2_slot);
379 : :
3301 380 : 0 : return selec;
381 : : }
382 : :
383 : : /*
384 : : * Semi join selectivity estimation for subnet inclusion/overlap operators
385 : : *
386 : : * Calculates MCV vs MCV, MCV vs histogram, histogram vs MCV, and histogram vs
387 : : * histogram selectivity for semi/anti join cases.
388 : : */
389 : : static Selectivity
390 : 0 : networkjoinsel_semi(Oid operator,
391 : : VariableStatData *vardata1, VariableStatData *vardata2)
392 : : {
393 : : Form_pg_statistic stats;
394 : 0 : Selectivity selec = 0.0,
395 : 0 : sumcommon1 = 0.0,
396 : 0 : sumcommon2 = 0.0;
397 : 0 : double nullfrac1 = 0.0,
398 : 0 : nullfrac2 = 0.0,
399 : 0 : hist2_weight = 0.0;
400 : 0 : bool mcv1_exists = false,
401 : 0 : mcv2_exists = false,
402 : 0 : hist1_exists = false,
403 : 0 : hist2_exists = false;
404 : : int opr_codenum;
405 : : FmgrInfo proc;
406 : : int i,
407 : 0 : mcv1_length = 0,
408 : 0 : mcv2_length = 0;
409 : : AttStatsSlot mcv1_slot;
410 : : AttStatsSlot mcv2_slot;
411 : : AttStatsSlot hist1_slot;
412 : : AttStatsSlot hist2_slot;
413 : :
414 [ # # ]: 0 : if (HeapTupleIsValid(vardata1->statsTuple))
415 : : {
416 : 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
417 : 0 : nullfrac1 = stats->stanullfrac;
418 : :
2528 419 : 0 : mcv1_exists = get_attstatsslot(&mcv1_slot, vardata1->statsTuple,
420 : : STATISTIC_KIND_MCV, InvalidOid,
421 : : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
422 : 0 : hist1_exists = get_attstatsslot(&hist1_slot, vardata1->statsTuple,
423 : : STATISTIC_KIND_HISTOGRAM, InvalidOid,
424 : : ATTSTATSSLOT_VALUES);
425 : : /* Arbitrarily limit number of MCVs considered */
426 : 0 : mcv1_length = Min(mcv1_slot.nvalues, MAX_CONSIDERED_ELEMS);
3301 427 [ # # ]: 0 : if (mcv1_exists)
2528 428 : 0 : sumcommon1 = mcv_population(mcv1_slot.numbers, mcv1_length);
429 : : }
430 : : else
431 : : {
432 : 0 : memset(&mcv1_slot, 0, sizeof(mcv1_slot));
433 : 0 : memset(&hist1_slot, 0, sizeof(hist1_slot));
434 : : }
435 : :
3301 436 [ # # ]: 0 : if (HeapTupleIsValid(vardata2->statsTuple))
437 : : {
438 : 0 : stats = (Form_pg_statistic) GETSTRUCT(vardata2->statsTuple);
439 : 0 : nullfrac2 = stats->stanullfrac;
440 : :
2528 441 : 0 : mcv2_exists = get_attstatsslot(&mcv2_slot, vardata2->statsTuple,
442 : : STATISTIC_KIND_MCV, InvalidOid,
443 : : ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS);
444 : 0 : hist2_exists = get_attstatsslot(&hist2_slot, vardata2->statsTuple,
445 : : STATISTIC_KIND_HISTOGRAM, InvalidOid,
446 : : ATTSTATSSLOT_VALUES);
447 : : /* Arbitrarily limit number of MCVs considered */
448 : 0 : mcv2_length = Min(mcv2_slot.nvalues, MAX_CONSIDERED_ELEMS);
3301 449 [ # # ]: 0 : if (mcv2_exists)
2528 450 : 0 : sumcommon2 = mcv_population(mcv2_slot.numbers, mcv2_length);
451 : : }
452 : : else
453 : : {
454 : 0 : memset(&mcv2_slot, 0, sizeof(mcv2_slot));
455 : 0 : memset(&hist2_slot, 0, sizeof(hist2_slot));
456 : : }
457 : :
3301 458 : 0 : opr_codenum = inet_opr_codenum(operator);
459 : 0 : fmgr_info(get_opcode(operator), &proc);
460 : :
461 : : /* Estimate number of input rows represented by RHS histogram. */
462 [ # # # # ]: 0 : if (hist2_exists && vardata2->rel)
463 : 0 : hist2_weight = (1.0 - nullfrac2 - sumcommon2) * vardata2->rel->rows;
464 : :
465 : : /*
466 : : * Consider each element of the LHS MCV list, matching it to whatever RHS
467 : : * stats we have. Scale according to the known frequency of the MCV.
468 : : */
469 [ # # # # : 0 : if (mcv1_exists && (mcv2_exists || hist2_exists))
# # ]
470 : : {
471 [ # # ]: 0 : for (i = 0; i < mcv1_length; i++)
472 : : {
2528 473 : 0 : selec += mcv1_slot.numbers[i] *
474 : 0 : inet_semi_join_sel(mcv1_slot.values[i],
475 : : mcv2_exists, mcv2_slot.values, mcv2_length,
476 : : hist2_exists,
477 : : hist2_slot.values, hist2_slot.nvalues,
478 : : hist2_weight,
479 : : &proc, opr_codenum);
480 : : }
481 : : }
482 : :
483 : : /*
484 : : * Consider each element of the LHS histogram, except for the first and
485 : : * last elements, which we exclude on the grounds that they're outliers
486 : : * and thus not very representative. Scale on the assumption that each
487 : : * such histogram element represents an equal share of the LHS histogram
488 : : * population (which is a bit bogus, because the members of its bucket may
489 : : * not all act the same with respect to the join clause, but it's hard to
490 : : * do better).
491 : : *
492 : : * If there are too many histogram elements, decimate to limit runtime.
493 : : */
494 [ # # # # : 0 : if (hist1_exists && hist1_slot.nvalues > 2 && (mcv2_exists || hist2_exists))
# # # # ]
495 : : {
3301 496 : 0 : double hist_selec_sum = 0.0;
497 : : int k,
498 : : n;
499 : :
2528 500 : 0 : k = (hist1_slot.nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
501 : :
3301 502 : 0 : n = 0;
2528 503 [ # # ]: 0 : for (i = 1; i < hist1_slot.nvalues - 1; i += k)
504 : : {
3301 505 : 0 : hist_selec_sum +=
2528 506 : 0 : inet_semi_join_sel(hist1_slot.values[i],
507 : : mcv2_exists, mcv2_slot.values, mcv2_length,
508 : : hist2_exists,
509 : : hist2_slot.values, hist2_slot.nvalues,
510 : : hist2_weight,
511 : : &proc, opr_codenum);
3301 512 : 0 : n++;
513 : : }
514 : :
515 : 0 : selec += (1.0 - nullfrac1 - sumcommon1) * hist_selec_sum / n;
516 : : }
517 : :
518 : : /*
519 : : * If useful statistics are not available then use the default estimate.
520 : : * We can apply null fractions if known, though.
521 : : */
522 [ # # # # : 0 : if ((!mcv1_exists && !hist1_exists) || (!mcv2_exists && !hist2_exists))
# # # # ]
523 [ # # ]: 0 : selec = (1.0 - nullfrac1) * (1.0 - nullfrac2) * DEFAULT_SEL(operator);
524 : :
525 : : /* Release stats. */
2528 526 : 0 : free_attstatsslot(&mcv1_slot);
527 : 0 : free_attstatsslot(&mcv2_slot);
528 : 0 : free_attstatsslot(&hist1_slot);
529 : 0 : free_attstatsslot(&hist2_slot);
530 : :
3301 531 : 0 : return selec;
532 : : }
533 : :
534 : : /*
535 : : * Compute the fraction of a relation's population that is represented
536 : : * by the MCV list.
537 : : */
538 : : static Selectivity
539 : 0 : mcv_population(float4 *mcv_numbers, int mcv_nvalues)
540 : : {
541 : 0 : Selectivity sumcommon = 0.0;
542 : : int i;
543 : :
544 [ # # ]: 0 : for (i = 0; i < mcv_nvalues; i++)
545 : : {
546 : 0 : sumcommon += mcv_numbers[i];
547 : : }
548 : :
549 : 0 : return sumcommon;
550 : : }
551 : :
552 : : /*
553 : : * Inet histogram vs single value selectivity estimation
554 : : *
555 : : * Estimate the fraction of the histogram population that satisfies
556 : : * "value OPR CONST". (The result needs to be scaled to reflect the
557 : : * proportion of the total population represented by the histogram.)
558 : : *
559 : : * The histogram is originally for the inet btree comparison operators.
560 : : * Only the common bits of the network part and the length of the network part
561 : : * (masklen) are interesting for the subnet inclusion operators. Fortunately,
562 : : * btree comparison treats the network part as the major sort key. Even so,
563 : : * the length of the network part would not really be significant in the
564 : : * histogram. This would lead to big mistakes for data sets with uneven
565 : : * masklen distribution. To reduce this problem, comparisons with the left
566 : : * and the right sides of the buckets are used together.
567 : : *
568 : : * Histogram bucket matches are calculated in two forms. If the constant
569 : : * matches both bucket endpoints the bucket is considered as fully matched.
570 : : * The second form is to match the bucket partially; we recognize this when
571 : : * the constant matches just one endpoint, or the two endpoints fall on
572 : : * opposite sides of the constant. (Note that when the constant matches an
573 : : * interior histogram element, it gets credit for partial matches to the
574 : : * buckets on both sides, while a match to a histogram endpoint gets credit
575 : : * for only one partial match. This is desirable.)
576 : : *
577 : : * The divider in the partial bucket match is imagined as the distance
578 : : * between the decisive bits and the common bits of the addresses. It will
579 : : * be used as a power of two as it is the natural scale for the IP network
580 : : * inclusion. This partial bucket match divider calculation is an empirical
581 : : * formula and subject to change with more experiment.
582 : : *
583 : : * For a partial match, we try to calculate dividers for both of the
584 : : * boundaries. If the address family of a boundary value does not match the
585 : : * constant or comparison of the length of the network parts is not correct
586 : : * for the operator, the divider for that boundary will not be taken into
587 : : * account. If both of the dividers are valid, the greater one will be used
588 : : * to minimize the mistake in buckets that have disparate masklens. This
589 : : * calculation is unfair when dividers can be calculated for both of the
590 : : * boundaries but they are far from each other; but it is not a common
591 : : * situation as the boundaries are expected to share most of their significant
592 : : * bits of their masklens. The mistake would be greater, if we would use the
593 : : * minimum instead of the maximum, and we don't know a sensible way to combine
594 : : * them.
595 : : *
596 : : * For partial match in buckets that have different address families on the
597 : : * left and right sides, only the boundary with the same address family is
598 : : * taken into consideration. This can cause more mistakes for these buckets
599 : : * if the masklens of their boundaries are also disparate. But this can only
600 : : * happen in one bucket, since only two address families exist. It seems a
601 : : * better option than not considering these buckets at all.
602 : : */
603 : : static Selectivity
604 : 0 : inet_hist_value_sel(Datum *values, int nvalues, Datum constvalue,
605 : : int opr_codenum)
606 : : {
607 : 0 : Selectivity match = 0.0;
608 : : inet *query,
609 : : *left,
610 : : *right;
611 : : int i,
612 : : k,
613 : : n;
614 : : int left_order,
615 : : right_order,
616 : : left_divider,
617 : : right_divider;
618 : :
619 : : /* guard against zero-divide below */
620 [ # # ]: 0 : if (nvalues <= 1)
621 : 0 : return 0.0;
622 : :
623 : : /* if there are too many histogram elements, decimate to limit runtime */
624 : 0 : k = (nvalues - 2) / MAX_CONSIDERED_ELEMS + 1;
625 : :
626 : 0 : query = DatumGetInetPP(constvalue);
627 : :
628 : : /* "left" is the left boundary value of the current bucket ... */
629 : 0 : left = DatumGetInetPP(values[0]);
630 : 0 : left_order = inet_inclusion_cmp(left, query, opr_codenum);
631 : :
632 : 0 : n = 0;
633 [ # # ]: 0 : for (i = k; i < nvalues; i += k)
634 : : {
635 : : /* ... and "right" is the right boundary value */
636 : 0 : right = DatumGetInetPP(values[i]);
637 : 0 : right_order = inet_inclusion_cmp(right, query, opr_codenum);
638 : :
639 [ # # # # ]: 0 : if (left_order == 0 && right_order == 0)
640 : : {
641 : : /* The whole bucket matches, since both endpoints do. */
642 : 0 : match += 1.0;
643 : : }
644 [ # # # # : 0 : else if ((left_order <= 0 && right_order >= 0) ||
# # ]
645 [ # # ]: 0 : (left_order >= 0 && right_order <= 0))
646 : : {
647 : : /* Partial bucket match. */
648 : 0 : left_divider = inet_hist_match_divider(left, query, opr_codenum);
649 : 0 : right_divider = inet_hist_match_divider(right, query, opr_codenum);
650 : :
651 [ # # # # ]: 0 : if (left_divider >= 0 || right_divider >= 0)
652 : 0 : match += 1.0 / pow(2.0, Max(left_divider, right_divider));
653 : : }
654 : :
655 : : /* Shift the variables. */
656 : 0 : left = right;
657 : 0 : left_order = right_order;
658 : :
659 : : /* Count the number of buckets considered. */
660 : 0 : n++;
661 : : }
662 : :
663 : 0 : return match / n;
664 : : }
665 : :
666 : : /*
667 : : * Inet MCV vs MCV join selectivity estimation
668 : : *
669 : : * We simply add up the fractions of the populations that satisfy the clause.
670 : : * The result is exact and does not need to be scaled further.
671 : : */
672 : : static Selectivity
673 : 0 : inet_mcv_join_sel(Datum *mcv1_values, float4 *mcv1_numbers, int mcv1_nvalues,
674 : : Datum *mcv2_values, float4 *mcv2_numbers, int mcv2_nvalues,
675 : : Oid operator)
676 : : {
677 : 0 : Selectivity selec = 0.0;
678 : : FmgrInfo proc;
679 : : int i,
680 : : j;
681 : :
682 : 0 : fmgr_info(get_opcode(operator), &proc);
683 : :
684 [ # # ]: 0 : for (i = 0; i < mcv1_nvalues; i++)
685 : : {
686 [ # # ]: 0 : for (j = 0; j < mcv2_nvalues; j++)
687 [ # # ]: 0 : if (DatumGetBool(FunctionCall2(&proc,
688 : : mcv1_values[i],
689 : : mcv2_values[j])))
690 : 0 : selec += mcv1_numbers[i] * mcv2_numbers[j];
691 : : }
692 : 0 : return selec;
693 : : }
694 : :
695 : : /*
696 : : * Inet MCV vs histogram join selectivity estimation
697 : : *
698 : : * For each MCV on the lefthand side, estimate the fraction of the righthand's
699 : : * histogram population that satisfies the join clause, and add those up,
700 : : * scaling by the MCV's frequency. The result still needs to be scaled
701 : : * according to the fraction of the righthand's population represented by
702 : : * the histogram.
703 : : */
704 : : static Selectivity
705 : 0 : inet_mcv_hist_sel(Datum *mcv_values, float4 *mcv_numbers, int mcv_nvalues,
706 : : Datum *hist_values, int hist_nvalues,
707 : : int opr_codenum)
708 : : {
709 : 0 : Selectivity selec = 0.0;
710 : : int i;
711 : :
712 : : /*
713 : : * We'll call inet_hist_value_selec with the histogram on the left, so we
714 : : * must commute the operator.
715 : : */
716 : 0 : opr_codenum = -opr_codenum;
717 : :
718 [ # # ]: 0 : for (i = 0; i < mcv_nvalues; i++)
719 : : {
720 : 0 : selec += mcv_numbers[i] *
721 : 0 : inet_hist_value_sel(hist_values, hist_nvalues, mcv_values[i],
722 : : opr_codenum);
723 : : }
724 : 0 : return selec;
725 : : }
726 : :
727 : : /*
728 : : * Inet histogram vs histogram join selectivity estimation
729 : : *
730 : : * Here, we take all values listed in the second histogram (except for the
731 : : * first and last elements, which are excluded on the grounds of possibly
732 : : * not being very representative) and treat them as a uniform sample of
733 : : * the non-MCV population for that relation. For each one, we apply
734 : : * inet_hist_value_selec to see what fraction of the first histogram
735 : : * it matches.
736 : : *
737 : : * We could alternatively do this the other way around using the operator's
738 : : * commutator. XXX would it be worthwhile to do it both ways and take the
739 : : * average? That would at least avoid non-commutative estimation results.
740 : : */
741 : : static Selectivity
742 : 0 : inet_hist_inclusion_join_sel(Datum *hist1_values, int hist1_nvalues,
743 : : Datum *hist2_values, int hist2_nvalues,
744 : : int opr_codenum)
745 : : {
746 : 0 : double match = 0.0;
747 : : int i,
748 : : k,
749 : : n;
750 : :
751 [ # # ]: 0 : if (hist2_nvalues <= 2)
752 : 0 : return 0.0; /* no interior histogram elements */
753 : :
754 : : /* if there are too many histogram elements, decimate to limit runtime */
755 : 0 : k = (hist2_nvalues - 3) / MAX_CONSIDERED_ELEMS + 1;
756 : :
757 : 0 : n = 0;
758 [ # # ]: 0 : for (i = 1; i < hist2_nvalues - 1; i += k)
759 : : {
760 : 0 : match += inet_hist_value_sel(hist1_values, hist1_nvalues,
761 : 0 : hist2_values[i], opr_codenum);
762 : 0 : n++;
763 : : }
764 : :
765 : 0 : return match / n;
766 : : }
767 : :
768 : : /*
769 : : * Inet semi join selectivity estimation for one value
770 : : *
771 : : * The function calculates the probability that there is at least one row
772 : : * in the RHS table that satisfies the "lhs_value op column" condition.
773 : : * It is used in semi join estimation to check a sample from the left hand
774 : : * side table.
775 : : *
776 : : * The MCV and histogram from the right hand side table should be provided as
777 : : * arguments with the lhs_value from the left hand side table for the join.
778 : : * hist_weight is the total number of rows represented by the histogram.
779 : : * For example, if the table has 1000 rows, and 10% of the rows are in the MCV
780 : : * list, and another 10% are NULLs, hist_weight would be 800.
781 : : *
782 : : * First, the lhs_value will be matched to the most common values. If it
783 : : * matches any of them, 1.0 will be returned, because then there is surely
784 : : * a match.
785 : : *
786 : : * Otherwise, the histogram will be used to estimate the number of rows in
787 : : * the second table that match the condition. If the estimate is greater
788 : : * than 1.0, 1.0 will be returned, because it means there is a greater chance
789 : : * that the lhs_value will match more than one row in the table. If it is
790 : : * between 0.0 and 1.0, it will be returned as the probability.
791 : : */
792 : : static Selectivity
793 : 0 : inet_semi_join_sel(Datum lhs_value,
794 : : bool mcv_exists, Datum *mcv_values, int mcv_nvalues,
795 : : bool hist_exists, Datum *hist_values, int hist_nvalues,
796 : : double hist_weight,
797 : : FmgrInfo *proc, int opr_codenum)
798 : : {
799 [ # # ]: 0 : if (mcv_exists)
800 : : {
801 : : int i;
802 : :
803 [ # # ]: 0 : for (i = 0; i < mcv_nvalues; i++)
804 : : {
805 [ # # ]: 0 : if (DatumGetBool(FunctionCall2(proc,
806 : : lhs_value,
807 : : mcv_values[i])))
808 : 0 : return 1.0;
809 : : }
810 : : }
811 : :
812 [ # # # # ]: 0 : if (hist_exists && hist_weight > 0)
813 : : {
814 : : Selectivity hist_selec;
815 : :
816 : : /* Commute operator, since we're passing lhs_value on the right */
817 : 0 : hist_selec = inet_hist_value_sel(hist_values, hist_nvalues,
818 : : lhs_value, -opr_codenum);
819 : :
820 [ # # ]: 0 : if (hist_selec > 0)
821 [ # # ]: 0 : return Min(1.0, hist_weight * hist_selec);
822 : : }
823 : :
824 : 0 : return 0.0;
825 : : }
826 : :
827 : : /*
828 : : * Assign useful code numbers for the subnet inclusion/overlap operators
829 : : *
830 : : * Only inet_masklen_inclusion_cmp() and inet_hist_match_divider() depend
831 : : * on the exact codes assigned here; but many other places in this file
832 : : * know that they can negate a code to obtain the code for the commutator
833 : : * operator.
834 : : */
835 : : static int
836 : 0 : inet_opr_codenum(Oid operator)
837 : : {
838 [ # # # # : 0 : switch (operator)
# # ]
839 : : {
840 : 0 : case OID_INET_SUP_OP:
841 : 0 : return -2;
842 : 0 : case OID_INET_SUPEQ_OP:
843 : 0 : return -1;
844 : 0 : case OID_INET_OVERLAP_OP:
845 : 0 : return 0;
846 : 0 : case OID_INET_SUBEQ_OP:
847 : 0 : return 1;
848 : 0 : case OID_INET_SUB_OP:
849 : 0 : return 2;
850 : 0 : default:
851 [ # # ]: 0 : elog(ERROR, "unrecognized operator %u for inet selectivity",
852 : : operator);
853 : : }
854 : : return 0; /* unreached, but keep compiler quiet */
855 : : }
856 : :
857 : : /*
858 : : * Comparison function for the subnet inclusion/overlap operators
859 : : *
860 : : * If the comparison is okay for the specified inclusion operator, the return
861 : : * value will be 0. Otherwise the return value will be less than or greater
862 : : * than 0 as appropriate for the operator.
863 : : *
864 : : * Comparison is compatible with the basic comparison function for the inet
865 : : * type. See network_cmp_internal() in network.c for the original. Basic
866 : : * comparison operators are implemented with the network_cmp_internal()
867 : : * function. It is possible to implement the subnet inclusion operators with
868 : : * this function.
869 : : *
870 : : * Comparison is first on the common bits of the network part, then on the
871 : : * length of the network part (masklen) as in the network_cmp_internal()
872 : : * function. Only the first part is in this function. The second part is
873 : : * separated to another function for reusability. The difference between the
874 : : * second part and the original network_cmp_internal() is that the inclusion
875 : : * operator is considered while comparing the lengths of the network parts.
876 : : * See the inet_masklen_inclusion_cmp() function below.
877 : : */
878 : : static int
879 : 0 : inet_inclusion_cmp(inet *left, inet *right, int opr_codenum)
880 : : {
881 [ # # # # : 0 : if (ip_family(left) == ip_family(right))
# # ]
882 : : {
883 : : int order;
884 : :
885 [ # # ]: 0 : order = bitncmp(ip_addr(left), ip_addr(right),
886 [ # # # # : 0 : Min(ip_bits(left), ip_bits(right)));
# # ]
887 [ # # ]: 0 : if (order != 0)
888 : 0 : return order;
889 : :
890 : 0 : return inet_masklen_inclusion_cmp(left, right, opr_codenum);
891 : : }
892 : :
893 [ # # # # ]: 0 : return ip_family(left) - ip_family(right);
894 : : }
895 : :
896 : : /*
897 : : * Masklen comparison function for the subnet inclusion/overlap operators
898 : : *
899 : : * Compares the lengths of the network parts of the inputs. If the comparison
900 : : * is okay for the specified inclusion operator, the return value will be 0.
901 : : * Otherwise the return value will be less than or greater than 0 as
902 : : * appropriate for the operator.
903 : : */
904 : : static int
905 : 0 : inet_masklen_inclusion_cmp(inet *left, inet *right, int opr_codenum)
906 : : {
907 : : int order;
908 : :
909 [ # # # # ]: 0 : order = (int) ip_bits(left) - (int) ip_bits(right);
910 : :
911 : : /*
912 : : * Return 0 if the operator would accept this combination of masklens.
913 : : * Note that opr_codenum zero (overlaps) will accept all cases.
914 : : */
915 [ # # # # : 0 : if ((order > 0 && opr_codenum >= 0) ||
# # ]
916 [ # # # # : 0 : (order == 0 && opr_codenum >= -1 && opr_codenum <= 1) ||
# # ]
917 [ # # ]: 0 : (order < 0 && opr_codenum <= 0))
918 : 0 : return 0;
919 : :
920 : : /*
921 : : * Otherwise, return a negative value for sup/supeq (notionally, the RHS
922 : : * needs to have a larger masklen than it has, which would make it sort
923 : : * later), or a positive value for sub/subeq (vice versa).
924 : : */
925 : 0 : return opr_codenum;
926 : : }
927 : :
928 : : /*
929 : : * Inet histogram partial match divider calculation
930 : : *
931 : : * First the families and the lengths of the network parts are compared using
932 : : * the subnet inclusion operator. If those are acceptable for the operator,
933 : : * the divider will be calculated using the masklens and the common bits of
934 : : * the addresses. -1 will be returned if it cannot be calculated.
935 : : *
936 : : * See commentary for inet_hist_value_sel() for some rationale for this.
937 : : */
938 : : static int
939 : 0 : inet_hist_match_divider(inet *boundary, inet *query, int opr_codenum)
940 : : {
941 [ # # # # : 0 : if (ip_family(boundary) == ip_family(query) &&
# # # # ]
942 : 0 : inet_masklen_inclusion_cmp(boundary, query, opr_codenum) == 0)
943 : : {
944 : : int min_bits,
945 : : decisive_bits;
946 : :
947 [ # # # # ]: 0 : min_bits = Min(ip_bits(boundary), ip_bits(query));
948 : :
949 : : /*
950 : : * Set decisive_bits to the masklen of the one that should contain the
951 : : * other according to the operator.
952 : : */
953 [ # # ]: 0 : if (opr_codenum < 0)
954 [ # # ]: 0 : decisive_bits = ip_bits(boundary);
955 [ # # ]: 0 : else if (opr_codenum > 0)
956 [ # # ]: 0 : decisive_bits = ip_bits(query);
957 : : else
958 : 0 : decisive_bits = min_bits;
959 : :
960 : : /*
961 : : * Now return the number of non-common decisive bits. (This will be
962 : : * zero if the boundary and query in fact match, else positive.)
963 : : */
964 [ # # ]: 0 : if (min_bits > 0)
965 : 0 : return decisive_bits - bitncommon(ip_addr(boundary),
966 [ # # # # ]: 0 : ip_addr(query),
967 : : min_bits);
968 : 0 : return decisive_bits;
969 : : }
970 : :
971 : 0 : return -1;
972 : : }
|