LCOV - differential code coverage report
Current view: top level - src/backend/utils/misc - sampling.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 44.2 % 86 38 48 38
Current Date: 2023-04-08 15:15:32 Functions: 70.0 % 10 7 3 7
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * sampling.c
       4                 :  *    Relation block sampling routines.
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/utils/misc/sampling.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : 
      16                 : #include "postgres.h"
      17                 : 
      18                 : #include <math.h>
      19                 : 
      20                 : #include "utils/sampling.h"
      21                 : 
      22                 : 
      23                 : /*
      24                 :  * BlockSampler_Init -- prepare for random sampling of blocknumbers
      25                 :  *
      26                 :  * BlockSampler provides algorithm for block level sampling of a relation
      27                 :  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
      28                 :  * It selects a random sample of samplesize blocks out of
      29                 :  * the nblocks blocks in the table. If the table has less than
      30                 :  * samplesize blocks, all blocks are selected.
      31                 :  *
      32                 :  * Since we know the total number of blocks in advance, we can use the
      33                 :  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
      34                 :  * algorithm.
      35                 :  *
      36                 :  * Returns the number of blocks that BlockSampler_Next will return.
      37                 :  */
      38                 : BlockNumber
      39 CBC       48512 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
      40                 :                   uint32 randseed)
      41                 : {
      42           48512 :     bs->N = nblocks;         /* measured table size */
      43                 : 
      44                 :     /*
      45                 :      * If we decide to reduce samplesize for tables that have less or not much
      46                 :      * more than samplesize blocks, here is the place to do it.
      47                 :      */
      48           48512 :     bs->n = samplesize;
      49           48512 :     bs->t = 0;                   /* blocks scanned so far */
      50           48512 :     bs->m = 0;                   /* blocks selected so far */
      51                 : 
      52           48512 :     sampler_random_init_state(randseed, &bs->randstate);
      53                 : 
      54           48512 :     return Min(bs->n, bs->N);
      55                 : }
      56                 : 
      57                 : bool
      58          691181 : BlockSampler_HasMore(BlockSampler bs)
      59                 : {
      60          691181 :     return (bs->t < bs->N) && (bs->m < bs->n);
      61                 : }
      62                 : 
      63                 : BlockNumber
      64          294716 : BlockSampler_Next(BlockSampler bs)
      65                 : {
      66          294716 :     BlockNumber K = bs->N - bs->t;    /* remaining blocks */
      67          294716 :     int         k = bs->n - bs->m;    /* blocks still to sample */
      68                 :     double      p;              /* probability to skip block */
      69                 :     double      V;              /* random */
      70                 : 
      71          294716 :     Assert(BlockSampler_HasMore(bs));   /* hence K > 0 and k > 0 */
      72                 : 
      73          294716 :     if ((BlockNumber) k >= K)
      74                 :     {
      75                 :         /* need all the rest */
      76          294716 :         bs->m++;
      77          294716 :         return bs->t++;
      78                 :     }
      79                 : 
      80                 :     /*----------
      81                 :      * It is not obvious that this code matches Knuth's Algorithm S.
      82                 :      * Knuth says to skip the current block with probability 1 - k/K.
      83                 :      * If we are to skip, we should advance t (hence decrease K), and
      84                 :      * repeat the same probabilistic test for the next block.  The naive
      85                 :      * implementation thus requires a sampler_random_fract() call for each
      86                 :      * block number.  But we can reduce this to one sampler_random_fract()
      87                 :      * call per selected block, by noting that each time the while-test
      88                 :      * succeeds, we can reinterpret V as a uniform random number in the range
      89                 :      * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
      90                 :      * the appropriate fraction of its former value, and our next loop
      91                 :      * makes the appropriate probabilistic test.
      92                 :      *
      93                 :      * We have initially K > k > 0.  If the loop reduces K to equal k,
      94                 :      * the next while-test must fail since p will become exactly zero
      95                 :      * (we assume there will not be roundoff error in the division).
      96                 :      * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
      97                 :      * to be doubly sure about roundoff error.)  Therefore K cannot become
      98                 :      * less than k, which means that we cannot fail to select enough blocks.
      99                 :      *----------
     100                 :      */
     101 UBC           0 :     V = sampler_random_fract(&bs->randstate);
     102               0 :     p = 1.0 - (double) k / (double) K;
     103               0 :     while (V < p)
     104                 :     {
     105                 :         /* skip */
     106               0 :         bs->t++;
     107               0 :         K--;                    /* keep K == N - t */
     108                 : 
     109                 :         /* adjust p to be new cutoff point in reduced range */
     110               0 :         p *= 1.0 - (double) k / (double) K;
     111                 :     }
     112                 : 
     113                 :     /* select */
     114               0 :     bs->m++;
     115               0 :     return bs->t++;
     116                 : }
     117                 : 
     118                 : /*
     119                 :  * These two routines embody Algorithm Z from "Random sampling with a
     120                 :  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
     121                 :  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
     122                 :  * of the count S of records to skip before processing another record.
     123                 :  * It is computed primarily based on t, the number of records already read.
     124                 :  * The only extra state needed between calls is W, a random state variable.
     125                 :  *
     126                 :  * reservoir_init_selection_state computes the initial W value.
     127                 :  *
     128                 :  * Given that we've already read t records (t >= n), reservoir_get_next_S
     129                 :  * determines the number of records to skip before the next record is
     130                 :  * processed.
     131                 :  */
     132                 : void
     133 CBC       24296 : reservoir_init_selection_state(ReservoirState rs, int n)
     134                 : {
     135                 :     /*
     136                 :      * Reservoir sampling is not used anywhere where it would need to return
     137                 :      * repeatable results so we can initialize it randomly.
     138                 :      */
     139           24296 :     sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     140                 :                               &rs->randstate);
     141                 : 
     142                 :     /* Initial value of W (for use when Algorithm Z is first applied) */
     143           24296 :     rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
     144           24296 : }
     145                 : 
     146                 : double
     147           57678 : reservoir_get_next_S(ReservoirState rs, double t, int n)
     148                 : {
     149                 :     double      S;
     150                 : 
     151                 :     /* The magic constant here is T from Vitter's paper */
     152           57678 :     if (t <= (22.0 * n))
     153                 :     {
     154                 :         /* Process records using Algorithm X until t is large enough */
     155                 :         double      V,
     156                 :                     quot;
     157                 : 
     158           57678 :         V = sampler_random_fract(&rs->randstate);    /* Generate V */
     159           57678 :         S = 0;
     160           57678 :         t += 1;
     161                 :         /* Note: "num" in Vitter's code is always equal to t - n */
     162           57678 :         quot = (t - (double) n) / t;
     163                 :         /* Find min S satisfying (4.1) */
     164          126325 :         while (quot > V)
     165                 :         {
     166           68647 :             S += 1;
     167           68647 :             t += 1;
     168           68647 :             quot *= (t - (double) n) / t;
     169                 :         }
     170                 :     }
     171                 :     else
     172                 :     {
     173                 :         /* Now apply Algorithm Z */
     174 UBC           0 :         double      W = rs->W;
     175               0 :         double      term = t - (double) n + 1;
     176                 : 
     177                 :         for (;;)
     178               0 :         {
     179                 :             double      numer,
     180                 :                         numer_lim,
     181                 :                         denom;
     182                 :             double      U,
     183                 :                         X,
     184                 :                         lhs,
     185                 :                         rhs,
     186                 :                         y,
     187                 :                         tmp;
     188                 : 
     189                 :             /* Generate U and X */
     190               0 :             U = sampler_random_fract(&rs->randstate);
     191               0 :             X = t * (W - 1.0);
     192               0 :             S = floor(X);       /* S is tentatively set to floor(X) */
     193                 :             /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
     194               0 :             tmp = (t + 1) / term;
     195               0 :             lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
     196               0 :             rhs = (((t + X) / (term + S)) * term) / t;
     197               0 :             if (lhs <= rhs)
     198                 :             {
     199               0 :                 W = rhs / lhs;
     200               0 :                 break;
     201                 :             }
     202                 :             /* Test if U <= f(S)/cg(X) */
     203               0 :             y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
     204               0 :             if ((double) n < S)
     205                 :             {
     206               0 :                 denom = t;
     207               0 :                 numer_lim = term + S;
     208                 :             }
     209                 :             else
     210                 :             {
     211               0 :                 denom = t - (double) n + S;
     212               0 :                 numer_lim = t + 1;
     213                 :             }
     214               0 :             for (numer = t + S; numer >= numer_lim; numer -= 1)
     215                 :             {
     216               0 :                 y *= numer / denom;
     217               0 :                 denom -= 1;
     218                 :             }
     219               0 :             W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
     220               0 :             if (exp(log(y) / n) <= (t + X) / t)
     221               0 :                 break;
     222                 :         }
     223               0 :         rs->W = W;
     224                 :     }
     225 CBC       57678 :     return S;
     226                 : }
     227                 : 
     228                 : 
     229                 : /*----------
     230                 :  * Random number generator used by sampling
     231                 :  *----------
     232                 :  */
     233                 : void
     234           72818 : sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
     235                 : {
     236           72818 :     pg_prng_seed(randstate, (uint64) seed);
     237           72818 : }
     238                 : 
     239                 : /* Select a random value R uniformly distributed in (0 - 1) */
     240                 : double
     241          139649 : sampler_random_fract(pg_prng_state *randstate)
     242                 : {
     243                 :     double      res;
     244                 : 
     245                 :     /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
     246                 :     do
     247                 :     {
     248          139649 :         res = pg_prng_double(randstate);
     249          139649 :     } while (unlikely(res == 0.0));
     250          139649 :     return res;
     251                 : }
     252                 : 
     253                 : 
     254                 : /*
     255                 :  * Backwards-compatible API for block sampling
     256                 :  *
     257                 :  * This code is now deprecated, but since it's still in use by many FDWs,
     258                 :  * we should keep it for awhile at least.  The functionality is the same as
     259                 :  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
     260                 :  * except that a common random state is used across all callers.
     261                 :  */
     262                 : static ReservoirStateData oldrs;
     263                 : static bool oldrs_initialized = false;
     264                 : 
     265                 : double
     266 UBC           0 : anl_random_fract(void)
     267                 : {
     268                 :     /* initialize if first time through */
     269               0 :     if (unlikely(!oldrs_initialized))
     270                 :     {
     271               0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     272                 :                                   &oldrs.randstate);
     273               0 :         oldrs_initialized = true;
     274                 :     }
     275                 : 
     276                 :     /* and compute a random fraction */
     277               0 :     return sampler_random_fract(&oldrs.randstate);
     278                 : }
     279                 : 
     280                 : double
     281               0 : anl_init_selection_state(int n)
     282                 : {
     283                 :     /* initialize if first time through */
     284               0 :     if (unlikely(!oldrs_initialized))
     285                 :     {
     286               0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
     287                 :                                   &oldrs.randstate);
     288               0 :         oldrs_initialized = true;
     289                 :     }
     290                 : 
     291                 :     /* Initial value of W (for use when Algorithm Z is first applied) */
     292               0 :     return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
     293                 : }
     294                 : 
     295                 : double
     296               0 : anl_get_next_S(double t, int n, double *stateptr)
     297                 : {
     298                 :     double      result;
     299                 : 
     300               0 :     oldrs.W = *stateptr;
     301               0 :     result = reservoir_get_next_S(&oldrs, t, n);
     302               0 :     *stateptr = oldrs.W;
     303               0 :     return result;
     304                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a