Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * sampling.c
4 : * Relation block sampling routines.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/misc/sampling.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include <math.h>
19 :
20 : #include "utils/sampling.h"
21 :
22 :
23 : /*
24 : * BlockSampler_Init -- prepare for random sampling of blocknumbers
25 : *
26 : * BlockSampler provides algorithm for block level sampling of a relation
27 : * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
28 : * It selects a random sample of samplesize blocks out of
29 : * the nblocks blocks in the table. If the table has less than
30 : * samplesize blocks, all blocks are selected.
31 : *
32 : * Since we know the total number of blocks in advance, we can use the
33 : * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
34 : * algorithm.
35 : *
36 : * Returns the number of blocks that BlockSampler_Next will return.
37 : */
38 : BlockNumber
2886 simon 39 CBC 48512 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
40 : uint32 randseed)
41 : {
42 48512 : bs->N = nblocks; /* measured table size */
43 :
44 : /*
45 : * If we decide to reduce samplesize for tables that have less or not much
46 : * more than samplesize blocks, here is the place to do it.
47 : */
48 48512 : bs->n = samplesize;
49 48512 : bs->t = 0; /* blocks scanned so far */
50 48512 : bs->m = 0; /* blocks selected so far */
51 :
497 tgl 52 48512 : sampler_random_init_state(randseed, &bs->randstate);
53 :
1180 alvherre 54 48512 : return Min(bs->n, bs->N);
55 : }
56 :
57 : bool
2886 simon 58 691181 : BlockSampler_HasMore(BlockSampler bs)
59 : {
60 691181 : return (bs->t < bs->N) && (bs->m < bs->n);
61 : }
62 :
63 : BlockNumber
64 294716 : BlockSampler_Next(BlockSampler bs)
65 : {
2118 tgl 66 294716 : BlockNumber K = bs->N - bs->t; /* remaining blocks */
67 294716 : int k = bs->n - bs->m; /* blocks still to sample */
68 : double p; /* probability to skip block */
69 : double V; /* random */
70 :
2886 simon 71 294716 : Assert(BlockSampler_HasMore(bs)); /* hence K > 0 and k > 0 */
72 :
73 294716 : if ((BlockNumber) k >= K)
74 : {
75 : /* need all the rest */
76 294716 : bs->m++;
77 294716 : return bs->t++;
78 : }
79 :
80 : /*----------
81 : * It is not obvious that this code matches Knuth's Algorithm S.
82 : * Knuth says to skip the current block with probability 1 - k/K.
83 : * If we are to skip, we should advance t (hence decrease K), and
84 : * repeat the same probabilistic test for the next block. The naive
85 : * implementation thus requires a sampler_random_fract() call for each
86 : * block number. But we can reduce this to one sampler_random_fract()
87 : * call per selected block, by noting that each time the while-test
88 : * succeeds, we can reinterpret V as a uniform random number in the range
89 : * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
90 : * the appropriate fraction of its former value, and our next loop
91 : * makes the appropriate probabilistic test.
92 : *
93 : * We have initially K > k > 0. If the loop reduces K to equal k,
94 : * the next while-test must fail since p will become exactly zero
95 : * (we assume there will not be roundoff error in the division).
96 : * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
97 : * to be doubly sure about roundoff error.) Therefore K cannot become
98 : * less than k, which means that we cannot fail to select enough blocks.
99 : *----------
100 : */
497 tgl 101 UBC 0 : V = sampler_random_fract(&bs->randstate);
2886 simon 102 0 : p = 1.0 - (double) k / (double) K;
103 0 : while (V < p)
104 : {
105 : /* skip */
106 0 : bs->t++;
107 0 : K--; /* keep K == N - t */
108 :
109 : /* adjust p to be new cutoff point in reduced range */
110 0 : p *= 1.0 - (double) k / (double) K;
111 : }
112 :
113 : /* select */
114 0 : bs->m++;
115 0 : return bs->t++;
116 : }
117 :
118 : /*
119 : * These two routines embody Algorithm Z from "Random sampling with a
120 : * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
121 : * (Mar. 1985), Pages 37-57. Vitter describes his algorithm in terms
122 : * of the count S of records to skip before processing another record.
123 : * It is computed primarily based on t, the number of records already read.
124 : * The only extra state needed between calls is W, a random state variable.
125 : *
126 : * reservoir_init_selection_state computes the initial W value.
127 : *
128 : * Given that we've already read t records (t >= n), reservoir_get_next_S
129 : * determines the number of records to skip before the next record is
130 : * processed.
131 : */
132 : void
2886 simon 133 CBC 24296 : reservoir_init_selection_state(ReservoirState rs, int n)
134 : {
135 : /*
136 : * Reservoir sampling is not used anywhere where it would need to return
137 : * repeatable results so we can initialize it randomly.
138 : */
497 tgl 139 24296 : sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
140 : &rs->randstate);
141 :
142 : /* Initial value of W (for use when Algorithm Z is first applied) */
143 24296 : rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
2886 simon 144 24296 : }
145 :
146 : double
147 57678 : reservoir_get_next_S(ReservoirState rs, double t, int n)
148 : {
149 : double S;
150 :
151 : /* The magic constant here is T from Vitter's paper */
152 57678 : if (t <= (22.0 * n))
153 : {
154 : /* Process records using Algorithm X until t is large enough */
155 : double V,
156 : quot;
157 :
497 tgl 158 57678 : V = sampler_random_fract(&rs->randstate); /* Generate V */
2886 simon 159 57678 : S = 0;
160 57678 : t += 1;
161 : /* Note: "num" in Vitter's code is always equal to t - n */
162 57678 : quot = (t - (double) n) / t;
163 : /* Find min S satisfying (4.1) */
164 126325 : while (quot > V)
165 : {
166 68647 : S += 1;
167 68647 : t += 1;
168 68647 : quot *= (t - (double) n) / t;
169 : }
170 : }
171 : else
172 : {
173 : /* Now apply Algorithm Z */
2886 simon 174 UBC 0 : double W = rs->W;
175 0 : double term = t - (double) n + 1;
176 :
177 : for (;;)
178 0 : {
179 : double numer,
180 : numer_lim,
181 : denom;
182 : double U,
183 : X,
184 : lhs,
185 : rhs,
186 : y,
187 : tmp;
188 :
189 : /* Generate U and X */
497 tgl 190 0 : U = sampler_random_fract(&rs->randstate);
2886 simon 191 0 : X = t * (W - 1.0);
192 0 : S = floor(X); /* S is tentatively set to floor(X) */
193 : /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
194 0 : tmp = (t + 1) / term;
195 0 : lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
196 0 : rhs = (((t + X) / (term + S)) * term) / t;
197 0 : if (lhs <= rhs)
198 : {
199 0 : W = rhs / lhs;
200 0 : break;
201 : }
202 : /* Test if U <= f(S)/cg(X) */
203 0 : y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
204 0 : if ((double) n < S)
205 : {
206 0 : denom = t;
207 0 : numer_lim = term + S;
208 : }
209 : else
210 : {
211 0 : denom = t - (double) n + S;
212 0 : numer_lim = t + 1;
213 : }
214 0 : for (numer = t + S; numer >= numer_lim; numer -= 1)
215 : {
216 0 : y *= numer / denom;
217 0 : denom -= 1;
218 : }
497 tgl 219 0 : W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
2886 simon 220 0 : if (exp(log(y) / n) <= (t + X) / t)
221 0 : break;
222 : }
223 0 : rs->W = W;
224 : }
2886 simon 225 CBC 57678 : return S;
226 : }
227 :
228 :
229 : /*----------
230 : * Random number generator used by sampling
231 : *----------
232 : */
233 : void
497 tgl 234 72818 : sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
235 : {
236 72818 : pg_prng_seed(randstate, (uint64) seed);
2886 simon 237 72818 : }
238 :
239 : /* Select a random value R uniformly distributed in (0 - 1) */
240 : double
497 tgl 241 139649 : sampler_random_fract(pg_prng_state *randstate)
242 : {
243 : double res;
244 :
245 : /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
246 : do
247 : {
248 139649 : res = pg_prng_double(randstate);
249 139649 : } while (unlikely(res == 0.0));
2839 250 139649 : return res;
251 : }
252 :
253 :
254 : /*
255 : * Backwards-compatible API for block sampling
256 : *
257 : * This code is now deprecated, but since it's still in use by many FDWs,
258 : * we should keep it for awhile at least. The functionality is the same as
259 : * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
260 : * except that a common random state is used across all callers.
261 : */
262 : static ReservoirStateData oldrs;
263 : static bool oldrs_initialized = false;
264 :
265 : double
2883 tgl 266 UBC 0 : anl_random_fract(void)
267 : {
268 : /* initialize if first time through */
497 269 0 : if (unlikely(!oldrs_initialized))
270 : {
271 0 : sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
272 : &oldrs.randstate);
273 0 : oldrs_initialized = true;
274 : }
275 :
276 : /* and compute a random fraction */
277 0 : return sampler_random_fract(&oldrs.randstate);
278 : }
279 :
280 : double
2883 281 0 : anl_init_selection_state(int n)
282 : {
283 : /* initialize if first time through */
497 284 0 : if (unlikely(!oldrs_initialized))
285 : {
286 0 : sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
287 : &oldrs.randstate);
288 0 : oldrs_initialized = true;
289 : }
290 :
291 : /* Initial value of W (for use when Algorithm Z is first applied) */
292 0 : return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
293 : }
294 :
295 : double
2883 296 0 : anl_get_next_S(double t, int n, double *stateptr)
297 : {
298 : double result;
299 :
300 0 : oldrs.W = *stateptr;
301 0 : result = reservoir_get_next_S(&oldrs, t, n);
302 0 : *stateptr = oldrs.W;
303 0 : return result;
304 : }
|