LCOV - differential code coverage report
Current view: top level - src/backend/utils/misc - sampling.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 44.2 % 86 38 48 38
Current Date: 2023-04-08 17:13:01 Functions: 70.0 % 10 7 3 7
Baseline: 15 Line coverage date bins:
Baseline Date: 2023-04-08 15:09:40 (240..) days: 44.2 % 86 38 48 38
Legend: Lines: hit not hit Function coverage date bins:
(240..) days: 70.0 % 10 7 3 7

 Age         Owner                  TLA  Line data    Source code
                                  1                 : /*-------------------------------------------------------------------------
                                  2                 :  *
                                  3                 :  * sampling.c
                                  4                 :  *    Relation block sampling routines.
                                  5                 :  *
                                  6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
                                  7                 :  * Portions Copyright (c) 1994, Regents of the University of California
                                  8                 :  *
                                  9                 :  *
                                 10                 :  * IDENTIFICATION
                                 11                 :  *    src/backend/utils/misc/sampling.c
                                 12                 :  *
                                 13                 :  *-------------------------------------------------------------------------
                                 14                 :  */
                                 15                 : 
                                 16                 : #include "postgres.h"
                                 17                 : 
                                 18                 : #include <math.h>
                                 19                 : 
                                 20                 : #include "utils/sampling.h"
                                 21                 : 
                                 22                 : 
                                 23                 : /*
                                 24                 :  * BlockSampler_Init -- prepare for random sampling of blocknumbers
                                 25                 :  *
                                 26                 :  * BlockSampler provides algorithm for block level sampling of a relation
                                 27                 :  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
                                 28                 :  * It selects a random sample of samplesize blocks out of
                                 29                 :  * the nblocks blocks in the table. If the table has less than
                                 30                 :  * samplesize blocks, all blocks are selected.
                                 31                 :  *
                                 32                 :  * Since we know the total number of blocks in advance, we can use the
                                 33                 :  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
                                 34                 :  * algorithm.
                                 35                 :  *
                                 36                 :  * Returns the number of blocks that BlockSampler_Next will return.
                                 37                 :  */
                                 38                 : BlockNumber
 2886 simon                      39 CBC       48512 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
                                 40                 :                   uint32 randseed)
                                 41                 : {
                                 42           48512 :     bs->N = nblocks;         /* measured table size */
                                 43                 : 
                                 44                 :     /*
                                 45                 :      * If we decide to reduce samplesize for tables that have less or not much
                                 46                 :      * more than samplesize blocks, here is the place to do it.
                                 47                 :      */
                                 48           48512 :     bs->n = samplesize;
                                 49           48512 :     bs->t = 0;                   /* blocks scanned so far */
                                 50           48512 :     bs->m = 0;                   /* blocks selected so far */
                                 51                 : 
  497 tgl                        52           48512 :     sampler_random_init_state(randseed, &bs->randstate);
                                 53                 : 
 1180 alvherre                   54           48512 :     return Min(bs->n, bs->N);
                                 55                 : }
                                 56                 : 
                                 57                 : bool
 2886 simon                      58          691181 : BlockSampler_HasMore(BlockSampler bs)
                                 59                 : {
                                 60          691181 :     return (bs->t < bs->N) && (bs->m < bs->n);
                                 61                 : }
                                 62                 : 
                                 63                 : BlockNumber
                                 64          294716 : BlockSampler_Next(BlockSampler bs)
                                 65                 : {
 2118 tgl                        66          294716 :     BlockNumber K = bs->N - bs->t;    /* remaining blocks */
                                 67          294716 :     int         k = bs->n - bs->m;    /* blocks still to sample */
                                 68                 :     double      p;              /* probability to skip block */
                                 69                 :     double      V;              /* random */
                                 70                 : 
 2886 simon                      71          294716 :     Assert(BlockSampler_HasMore(bs));   /* hence K > 0 and k > 0 */
                                 72                 : 
                                 73          294716 :     if ((BlockNumber) k >= K)
                                 74                 :     {
                                 75                 :         /* need all the rest */
                                 76          294716 :         bs->m++;
                                 77          294716 :         return bs->t++;
                                 78                 :     }
                                 79                 : 
                                 80                 :     /*----------
                                 81                 :      * It is not obvious that this code matches Knuth's Algorithm S.
                                 82                 :      * Knuth says to skip the current block with probability 1 - k/K.
                                 83                 :      * If we are to skip, we should advance t (hence decrease K), and
                                 84                 :      * repeat the same probabilistic test for the next block.  The naive
                                 85                 :      * implementation thus requires a sampler_random_fract() call for each
                                 86                 :      * block number.  But we can reduce this to one sampler_random_fract()
                                 87                 :      * call per selected block, by noting that each time the while-test
                                 88                 :      * succeeds, we can reinterpret V as a uniform random number in the range
                                 89                 :      * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
                                 90                 :      * the appropriate fraction of its former value, and our next loop
                                 91                 :      * makes the appropriate probabilistic test.
                                 92                 :      *
                                 93                 :      * We have initially K > k > 0.  If the loop reduces K to equal k,
                                 94                 :      * the next while-test must fail since p will become exactly zero
                                 95                 :      * (we assume there will not be roundoff error in the division).
                                 96                 :      * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
                                 97                 :      * to be doubly sure about roundoff error.)  Therefore K cannot become
                                 98                 :      * less than k, which means that we cannot fail to select enough blocks.
                                 99                 :      *----------
                                100                 :      */
  497 tgl                       101 UBC           0 :     V = sampler_random_fract(&bs->randstate);
 2886 simon                     102               0 :     p = 1.0 - (double) k / (double) K;
                                103               0 :     while (V < p)
                                104                 :     {
                                105                 :         /* skip */
                                106               0 :         bs->t++;
                                107               0 :         K--;                    /* keep K == N - t */
                                108                 : 
                                109                 :         /* adjust p to be new cutoff point in reduced range */
                                110               0 :         p *= 1.0 - (double) k / (double) K;
                                111                 :     }
                                112                 : 
                                113                 :     /* select */
                                114               0 :     bs->m++;
                                115               0 :     return bs->t++;
                                116                 : }
                                117                 : 
                                118                 : /*
                                119                 :  * These two routines embody Algorithm Z from "Random sampling with a
                                120                 :  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
                                121                 :  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
                                122                 :  * of the count S of records to skip before processing another record.
                                123                 :  * It is computed primarily based on t, the number of records already read.
                                124                 :  * The only extra state needed between calls is W, a random state variable.
                                125                 :  *
                                126                 :  * reservoir_init_selection_state computes the initial W value.
                                127                 :  *
                                128                 :  * Given that we've already read t records (t >= n), reservoir_get_next_S
                                129                 :  * determines the number of records to skip before the next record is
                                130                 :  * processed.
                                131                 :  */
                                132                 : void
 2886 simon                     133 CBC       24296 : reservoir_init_selection_state(ReservoirState rs, int n)
                                134                 : {
                                135                 :     /*
                                136                 :      * Reservoir sampling is not used anywhere where it would need to return
                                137                 :      * repeatable results so we can initialize it randomly.
                                138                 :      */
  497 tgl                       139           24296 :     sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
                                140                 :                               &rs->randstate);
                                141                 : 
                                142                 :     /* Initial value of W (for use when Algorithm Z is first applied) */
                                143           24296 :     rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
 2886 simon                     144           24296 : }
                                145                 : 
                                146                 : double
                                147           57678 : reservoir_get_next_S(ReservoirState rs, double t, int n)
                                148                 : {
                                149                 :     double      S;
                                150                 : 
                                151                 :     /* The magic constant here is T from Vitter's paper */
                                152           57678 :     if (t <= (22.0 * n))
                                153                 :     {
                                154                 :         /* Process records using Algorithm X until t is large enough */
                                155                 :         double      V,
                                156                 :                     quot;
                                157                 : 
  497 tgl                       158           57678 :         V = sampler_random_fract(&rs->randstate);    /* Generate V */
 2886 simon                     159           57678 :         S = 0;
                                160           57678 :         t += 1;
                                161                 :         /* Note: "num" in Vitter's code is always equal to t - n */
                                162           57678 :         quot = (t - (double) n) / t;
                                163                 :         /* Find min S satisfying (4.1) */
                                164          126325 :         while (quot > V)
                                165                 :         {
                                166           68647 :             S += 1;
                                167           68647 :             t += 1;
                                168           68647 :             quot *= (t - (double) n) / t;
                                169                 :         }
                                170                 :     }
                                171                 :     else
                                172                 :     {
                                173                 :         /* Now apply Algorithm Z */
 2886 simon                     174 UBC           0 :         double      W = rs->W;
                                175               0 :         double      term = t - (double) n + 1;
                                176                 : 
                                177                 :         for (;;)
                                178               0 :         {
                                179                 :             double      numer,
                                180                 :                         numer_lim,
                                181                 :                         denom;
                                182                 :             double      U,
                                183                 :                         X,
                                184                 :                         lhs,
                                185                 :                         rhs,
                                186                 :                         y,
                                187                 :                         tmp;
                                188                 : 
                                189                 :             /* Generate U and X */
  497 tgl                       190               0 :             U = sampler_random_fract(&rs->randstate);
 2886 simon                     191               0 :             X = t * (W - 1.0);
                                192               0 :             S = floor(X);       /* S is tentatively set to floor(X) */
                                193                 :             /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
                                194               0 :             tmp = (t + 1) / term;
                                195               0 :             lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
                                196               0 :             rhs = (((t + X) / (term + S)) * term) / t;
                                197               0 :             if (lhs <= rhs)
                                198                 :             {
                                199               0 :                 W = rhs / lhs;
                                200               0 :                 break;
                                201                 :             }
                                202                 :             /* Test if U <= f(S)/cg(X) */
                                203               0 :             y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
                                204               0 :             if ((double) n < S)
                                205                 :             {
                                206               0 :                 denom = t;
                                207               0 :                 numer_lim = term + S;
                                208                 :             }
                                209                 :             else
                                210                 :             {
                                211               0 :                 denom = t - (double) n + S;
                                212               0 :                 numer_lim = t + 1;
                                213                 :             }
                                214               0 :             for (numer = t + S; numer >= numer_lim; numer -= 1)
                                215                 :             {
                                216               0 :                 y *= numer / denom;
                                217               0 :                 denom -= 1;
                                218                 :             }
  497 tgl                       219               0 :             W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
 2886 simon                     220               0 :             if (exp(log(y) / n) <= (t + X) / t)
                                221               0 :                 break;
                                222                 :         }
                                223               0 :         rs->W = W;
                                224                 :     }
 2886 simon                     225 CBC       57678 :     return S;
                                226                 : }
                                227                 : 
                                228                 : 
                                229                 : /*----------
                                230                 :  * Random number generator used by sampling
                                231                 :  *----------
                                232                 :  */
                                233                 : void
  497 tgl                       234           72818 : sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
                                235                 : {
                                236           72818 :     pg_prng_seed(randstate, (uint64) seed);
 2886 simon                     237           72818 : }
                                238                 : 
                                239                 : /* Select a random value R uniformly distributed in (0 - 1) */
                                240                 : double
  497 tgl                       241          139649 : sampler_random_fract(pg_prng_state *randstate)
                                242                 : {
                                243                 :     double      res;
                                244                 : 
                                245                 :     /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
                                246                 :     do
                                247                 :     {
                                248          139649 :         res = pg_prng_double(randstate);
                                249          139649 :     } while (unlikely(res == 0.0));
 2839                           250          139649 :     return res;
                                251                 : }
                                252                 : 
                                253                 : 
                                254                 : /*
                                255                 :  * Backwards-compatible API for block sampling
                                256                 :  *
                                257                 :  * This code is now deprecated, but since it's still in use by many FDWs,
                                258                 :  * we should keep it for awhile at least.  The functionality is the same as
                                259                 :  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
                                260                 :  * except that a common random state is used across all callers.
                                261                 :  */
                                262                 : static ReservoirStateData oldrs;
                                263                 : static bool oldrs_initialized = false;
                                264                 : 
                                265                 : double
 2883 tgl                       266 UBC           0 : anl_random_fract(void)
                                267                 : {
                                268                 :     /* initialize if first time through */
  497                           269               0 :     if (unlikely(!oldrs_initialized))
                                270                 :     {
                                271               0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
                                272                 :                                   &oldrs.randstate);
                                273               0 :         oldrs_initialized = true;
                                274                 :     }
                                275                 : 
                                276                 :     /* and compute a random fraction */
                                277               0 :     return sampler_random_fract(&oldrs.randstate);
                                278                 : }
                                279                 : 
                                280                 : double
 2883                           281               0 : anl_init_selection_state(int n)
                                282                 : {
                                283                 :     /* initialize if first time through */
  497                           284               0 :     if (unlikely(!oldrs_initialized))
                                285                 :     {
                                286               0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
                                287                 :                                   &oldrs.randstate);
                                288               0 :         oldrs_initialized = true;
                                289                 :     }
                                290                 : 
                                291                 :     /* Initial value of W (for use when Algorithm Z is first applied) */
                                292               0 :     return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
                                293                 : }
                                294                 : 
                                295                 : double
 2883                           296               0 : anl_get_next_S(double t, int n, double *stateptr)
                                297                 : {
                                298                 :     double      result;
                                299                 : 
                                300               0 :     oldrs.W = *stateptr;
                                301               0 :     result = reservoir_get_next_S(&oldrs, t, n);
                                302               0 :     *stateptr = oldrs.W;
                                303               0 :     return result;
                                304                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a