Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * sampling.c
4 : : * Relation block sampling routines.
5 : : *
6 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/utils/misc/sampling.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : :
16 : : #include "postgres.h"
17 : :
18 : : #include <math.h>
19 : :
20 : : #include "utils/sampling.h"
21 : :
22 : :
23 : : /*
24 : : * BlockSampler_Init -- prepare for random sampling of blocknumbers
25 : : *
26 : : * BlockSampler provides algorithm for block level sampling of a relation
27 : : * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28 : : * It selects a random sample of samplesize blocks out of
29 : : * the nblocks blocks in the table. If the table has less than
30 : : * samplesize blocks, all blocks are selected.
31 : : *
32 : : * Since we know the total number of blocks in advance, we can use the
33 : : * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34 : : * algorithm.
35 : : *
36 : : * Returns the number of blocks that BlockSampler_Next will return.
37 : : */
38 : : BlockNumber
3257 simon@2ndQuadrant.co 39 :CBC 6819 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
40 : : uint32 randseed)
41 : : {
42 : 6819 : bs->N = nblocks; /* measured table size */
43 : :
44 : : /*
45 : : * If we decide to reduce samplesize for tables that have less or not much
46 : : * more than samplesize blocks, here is the place to do it.
47 : : */
48 : 6819 : bs->n = samplesize;
49 : 6819 : bs->t = 0; /* blocks scanned so far */
50 : 6819 : bs->m = 0; /* blocks selected so far */
51 : :
868 tgl@sss.pgh.pa.us 52 : 6819 : sampler_random_init_state(randseed, &bs->randstate);
53 : :
1551 alvherre@alvh.no-ip. 54 : 6819 : return Min(bs->n, bs->N);
55 : : }
56 : :
57 : : bool
3257 simon@2ndQuadrant.co 58 : 129603 : BlockSampler_HasMore(BlockSampler bs)
59 : : {
60 [ + + + - ]: 129603 : return (bs->t < bs->N) && (bs->m < bs->n);
61 : : }
62 : :
63 : : BlockNumber
64 : 61392 : BlockSampler_Next(BlockSampler bs)
65 : : {
2489 tgl@sss.pgh.pa.us 66 : 61392 : BlockNumber K = bs->N - bs->t; /* remaining blocks */
67 : 61392 : int k = bs->n - bs->m; /* blocks still to sample */
68 : : double p; /* probability to skip block */
69 : : double V; /* random */
70 : :
3257 simon@2ndQuadrant.co 71 [ - + ]: 61392 : Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
72 : :
73 [ + - ]: 61392 : if ((BlockNumber) k >= K)
74 : : {
75 : : /* need all the rest */
76 : 61392 : bs->m++;
77 : 61392 : return bs->t++;
78 : : }
79 : :
80 : : /*----------
81 : : * It is not obvious that this code matches Knuth's Algorithm S.
82 : : * Knuth says to skip the current block with probability 1 - k/K.
83 : : * If we are to skip, we should advance t (hence decrease K), and
84 : : * repeat the same probabilistic test for the next block. The naive
85 : : * implementation thus requires a sampler_random_fract() call for each
86 : : * block number. But we can reduce this to one sampler_random_fract()
87 : : * call per selected block, by noting that each time the while-test
88 : : * succeeds, we can reinterpret V as a uniform random number in the range
89 : : * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
90 : : * the appropriate fraction of its former value, and our next loop
91 : : * makes the appropriate probabilistic test.
92 : : *
93 : : * We have initially K > k > 0. If the loop reduces K to equal k,
94 : : * the next while-test must fail since p will become exactly zero
95 : : * (we assume there will not be roundoff error in the division).
96 : : * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
97 : : * to be doubly sure about roundoff error.) Therefore K cannot become
98 : : * less than k, which means that we cannot fail to select enough blocks.
99 : : *----------
100 : : */
868 tgl@sss.pgh.pa.us 101 :UBC 0 : V = sampler_random_fract(&bs->randstate);
3257 simon@2ndQuadrant.co 102 : 0 : p = 1.0 - (double) k / (double) K;
103 [ # # ]: 0 : while (V < p)
104 : : {
105 : : /* skip */
106 : 0 : bs->t++;
107 : 0 : K--; /* keep K == N - t */
108 : :
109 : : /* adjust p to be new cutoff point in reduced range */
110 : 0 : p *= 1.0 - (double) k / (double) K;
111 : : }
112 : :
113 : : /* select */
114 : 0 : bs->m++;
115 : 0 : return bs->t++;
116 : : }
117 : :
118 : : /*
119 : : * These two routines embody Algorithm Z from "Random sampling with a
120 : : * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
121 : : * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
122 : : * of the count S of records to skip before processing another record.
123 : : * It is computed primarily based on t, the number of records already read.
124 : : * The only extra state needed between calls is W, a random state variable.
125 : : *
126 : : * reservoir_init_selection_state computes the initial W value.
127 : : *
128 : : * Given that we've already read t records (t >= n), reservoir_get_next_S
129 : : * determines the number of records to skip before the next record is
130 : : * processed.
131 : : */
132 : : void
3257 simon@2ndQuadrant.co 133 :CBC 6859 : reservoir_init_selection_state(ReservoirState rs, int n)
134 : : {
135 : : /*
136 : : * Reservoir sampling is not used anywhere where it would need to return
137 : : * repeatable results so we can initialize it randomly.
138 : : */
868 tgl@sss.pgh.pa.us 139 : 6859 : sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
140 : : &rs->randstate);
141 : :
142 : : /* Initial value of W (for use when Algorithm Z is first applied) */
143 : 6859 : rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
3257 simon@2ndQuadrant.co 144 : 6859 : }
145 : :
146 : : double
147 : 57702 : reservoir_get_next_S(ReservoirState rs, double t, int n)
148 : : {
149 : : double S;
150 : :
151 : : /* The magic constant here is T from Vitter's paper */
152 [ + - ]: 57702 : if (t <= (22.0 * n))
153 : : {
154 : : /* Process records using Algorithm X until t is large enough */
155 : : double V,
156 : : quot;
157 : :
868 tgl@sss.pgh.pa.us 158 : 57702 : V = sampler_random_fract(&rs->randstate); /* Generate V */
3257 simon@2ndQuadrant.co 159 : 57702 : S = 0;
160 : 57702 : t += 1;
161 : : /* Note: "num" in Vitter's code is always equal to t - n */
162 : 57702 : quot = (t - (double) n) / t;
163 : : /* Find min S satisfying (4.1) */
164 [ + + ]: 126799 : while (quot > V)
165 : : {
166 : 69097 : S += 1;
167 : 69097 : t += 1;
168 : 69097 : quot *= (t - (double) n) / t;
169 : : }
170 : : }
171 : : else
172 : : {
173 : : /* Now apply Algorithm Z */
3257 simon@2ndQuadrant.co 174 :UBC 0 : double W = rs->W;
175 : 0 : double term = t - (double) n + 1;
176 : :
177 : : for (;;)
178 : 0 : {
179 : : double numer,
180 : : numer_lim,
181 : : denom;
182 : : double U,
183 : : X,
184 : : lhs,
185 : : rhs,
186 : : y,
187 : : tmp;
188 : :
189 : : /* Generate U and X */
868 tgl@sss.pgh.pa.us 190 : 0 : U = sampler_random_fract(&rs->randstate);
3257 simon@2ndQuadrant.co 191 : 0 : X = t * (W - 1.0);
192 : 0 : S = floor(X); /* S is tentatively set to floor(X) */
193 : : /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
194 : 0 : tmp = (t + 1) / term;
195 : 0 : lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
196 : 0 : rhs = (((t + X) / (term + S)) * term) / t;
197 [ # # ]: 0 : if (lhs <= rhs)
198 : : {
199 : 0 : W = rhs / lhs;
200 : 0 : break;
201 : : }
202 : : /* Test if U <= f(S)/cg(X) */
203 : 0 : y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
204 [ # # ]: 0 : if ((double) n < S)
205 : : {
206 : 0 : denom = t;
207 : 0 : numer_lim = term + S;
208 : : }
209 : : else
210 : : {
211 : 0 : denom = t - (double) n + S;
212 : 0 : numer_lim = t + 1;
213 : : }
214 [ # # ]: 0 : for (numer = t + S; numer >= numer_lim; numer -= 1)
215 : : {
216 : 0 : y *= numer / denom;
217 : 0 : denom -= 1;
218 : : }
868 tgl@sss.pgh.pa.us 219 : 0 : W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
3257 simon@2ndQuadrant.co 220 [ # # ]: 0 : if (exp(log(y) / n) <= (t + X) / t)
221 : 0 : break;
222 : : }
223 : 0 : rs->W = W;
224 : : }
3257 simon@2ndQuadrant.co 225 :CBC 57702 : return S;
226 : : }
227 : :
228 : :
229 : : /*----------
230 : : * Random number generator used by sampling
231 : : *----------
232 : : */
233 : : void
868 tgl@sss.pgh.pa.us 234 : 13688 : sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
235 : : {
236 : 13688 : pg_prng_seed(randstate, (uint64) seed);
3257 simon@2ndQuadrant.co 237 : 13688 : }
238 : :
239 : : /* Select a random value R uniformly distributed in (0 - 1) */
240 : : double
868 tgl@sss.pgh.pa.us 241 : 122258 : sampler_random_fract(pg_prng_state *randstate)
242 : : {
243 : : double res;
244 : :
245 : : /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
246 : : do
247 : : {
248 : 122258 : res = pg_prng_double(randstate);
249 [ - + ]: 122258 : } while (unlikely(res == 0.0));
3210 250 : 122258 : return res;
251 : : }
252 : :
253 : :
254 : : /*
255 : : * Backwards-compatible API for block sampling
256 : : *
257 : : * This code is now deprecated, but since it's still in use by many FDWs,
258 : : * we should keep it for awhile at least. The functionality is the same as
259 : : * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
260 : : * except that a common random state is used across all callers.
261 : : */
262 : : static ReservoirStateData oldrs;
263 : : static bool oldrs_initialized = false;
264 : :
265 : : double
3254 tgl@sss.pgh.pa.us 266 :UBC 0 : anl_random_fract(void)
267 : : {
268 : : /* initialize if first time through */
868 269 [ # # ]: 0 : if (unlikely(!oldrs_initialized))
270 : : {
271 : 0 : sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
272 : : &oldrs.randstate);
273 : 0 : oldrs_initialized = true;
274 : : }
275 : :
276 : : /* and compute a random fraction */
277 : 0 : return sampler_random_fract(&oldrs.randstate);
278 : : }
279 : :
280 : : double
3254 281 : 0 : anl_init_selection_state(int n)
282 : : {
283 : : /* initialize if first time through */
868 284 [ # # ]: 0 : if (unlikely(!oldrs_initialized))
285 : : {
286 : 0 : sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
287 : : &oldrs.randstate);
288 : 0 : oldrs_initialized = true;
289 : : }
290 : :
291 : : /* Initial value of W (for use when Algorithm Z is first applied) */
292 : 0 : return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
293 : : }
294 : :
295 : : double
3254 296 : 0 : anl_get_next_S(double t, int n, double *stateptr)
297 : : {
298 : : double result;
299 : :
300 : 0 : oldrs.W = *stateptr;
301 : 0 : result = reservoir_get_next_S(&oldrs, t, n);
302 : 0 : *stateptr = oldrs.W;
303 : 0 : return result;
304 : : }
|