LCOV - differential code coverage report
Current view: top level - src/backend/utils/misc - sampling.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage 16@8cea358b128 vs 17@8cea358b128 Lines: 44.2 % 86 38 48 38
Current Date: 2024-04-14 14:21:10 Functions: 70.0 % 10 7 3 7
Baseline: 16@8cea358b128 Branches: 32.1 % 28 9 19 9
Baseline Date: 2024-04-14 14:21:09 Line coverage date bins:
Legend: Lines: hit not hit | Branches: + taken - not taken # not executed (240..) days: 44.2 % 86 38 48 38
Function coverage date bins:
(240..) days: 70.0 % 10 7 3 7
Branch coverage date bins:
(240..) days: 32.1 % 28 9 19 9

 Age         Owner                    Branch data    TLA  Line data    Source code
                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  *
                                  3                 :                :  * sampling.c
                                  4                 :                :  *    Relation block sampling routines.
                                  5                 :                :  *
                                  6                 :                :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
                                  7                 :                :  * Portions Copyright (c) 1994, Regents of the University of California
                                  8                 :                :  *
                                  9                 :                :  *
                                 10                 :                :  * IDENTIFICATION
                                 11                 :                :  *    src/backend/utils/misc/sampling.c
                                 12                 :                :  *
                                 13                 :                :  *-------------------------------------------------------------------------
                                 14                 :                :  */
                                 15                 :                : 
                                 16                 :                : #include "postgres.h"
                                 17                 :                : 
                                 18                 :                : #include <math.h>
                                 19                 :                : 
                                 20                 :                : #include "utils/sampling.h"
                                 21                 :                : 
                                 22                 :                : 
                                 23                 :                : /*
                                 24                 :                :  * BlockSampler_Init -- prepare for random sampling of blocknumbers
                                 25                 :                :  *
                                 26                 :                :  * BlockSampler provides algorithm for block level sampling of a relation
                                 27                 :                :  * as discussed on pgsql-hackers 2004-04-02 (subject "Large DB")
                                 28                 :                :  * It selects a random sample of samplesize blocks out of
                                 29                 :                :  * the nblocks blocks in the table. If the table has less than
                                 30                 :                :  * samplesize blocks, all blocks are selected.
                                 31                 :                :  *
                                 32                 :                :  * Since we know the total number of blocks in advance, we can use the
                                 33                 :                :  * straightforward Algorithm S from Knuth 3.4.2, rather than Vitter's
                                 34                 :                :  * algorithm.
                                 35                 :                :  *
                                 36                 :                :  * Returns the number of blocks that BlockSampler_Next will return.
                                 37                 :                :  */
                                 38                 :                : BlockNumber
 3257 simon@2ndQuadrant.co       39                 :CBC        6819 : BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, int samplesize,
                                 40                 :                :                   uint32 randseed)
                                 41                 :                : {
                                 42                 :           6819 :     bs->N = nblocks;         /* measured table size */
                                 43                 :                : 
                                 44                 :                :     /*
                                 45                 :                :      * If we decide to reduce samplesize for tables that have less or not much
                                 46                 :                :      * more than samplesize blocks, here is the place to do it.
                                 47                 :                :      */
                                 48                 :           6819 :     bs->n = samplesize;
                                 49                 :           6819 :     bs->t = 0;                   /* blocks scanned so far */
                                 50                 :           6819 :     bs->m = 0;                   /* blocks selected so far */
                                 51                 :                : 
  868 tgl@sss.pgh.pa.us          52                 :           6819 :     sampler_random_init_state(randseed, &bs->randstate);
                                 53                 :                : 
 1551 alvherre@alvh.no-ip.       54                 :           6819 :     return Min(bs->n, bs->N);
                                 55                 :                : }
                                 56                 :                : 
                                 57                 :                : bool
 3257 simon@2ndQuadrant.co       58                 :         129603 : BlockSampler_HasMore(BlockSampler bs)
                                 59                 :                : {
                                 60   [ +  +  +  - ]:         129603 :     return (bs->t < bs->N) && (bs->m < bs->n);
                                 61                 :                : }
                                 62                 :                : 
                                 63                 :                : BlockNumber
                                 64                 :          61392 : BlockSampler_Next(BlockSampler bs)
                                 65                 :                : {
 2489 tgl@sss.pgh.pa.us          66                 :          61392 :     BlockNumber K = bs->N - bs->t;    /* remaining blocks */
                                 67                 :          61392 :     int         k = bs->n - bs->m;    /* blocks still to sample */
                                 68                 :                :     double      p;              /* probability to skip block */
                                 69                 :                :     double      V;              /* random */
                                 70                 :                : 
 3257 simon@2ndQuadrant.co       71         [ -  + ]:          61392 :     Assert(BlockSampler_HasMore(bs));   /* hence K > 0 and k > 0 */
                                 72                 :                : 
                                 73         [ +  - ]:          61392 :     if ((BlockNumber) k >= K)
                                 74                 :                :     {
                                 75                 :                :         /* need all the rest */
                                 76                 :          61392 :         bs->m++;
                                 77                 :          61392 :         return bs->t++;
                                 78                 :                :     }
                                 79                 :                : 
                                 80                 :                :     /*----------
                                 81                 :                :      * It is not obvious that this code matches Knuth's Algorithm S.
                                 82                 :                :      * Knuth says to skip the current block with probability 1 - k/K.
                                 83                 :                :      * If we are to skip, we should advance t (hence decrease K), and
                                 84                 :                :      * repeat the same probabilistic test for the next block.  The naive
                                 85                 :                :      * implementation thus requires a sampler_random_fract() call for each
                                 86                 :                :      * block number.  But we can reduce this to one sampler_random_fract()
                                 87                 :                :      * call per selected block, by noting that each time the while-test
                                 88                 :                :      * succeeds, we can reinterpret V as a uniform random number in the range
                                 89                 :                :      * 0 to p. Therefore, instead of choosing a new V, we just adjust p to be
                                 90                 :                :      * the appropriate fraction of its former value, and our next loop
                                 91                 :                :      * makes the appropriate probabilistic test.
                                 92                 :                :      *
                                 93                 :                :      * We have initially K > k > 0.  If the loop reduces K to equal k,
                                 94                 :                :      * the next while-test must fail since p will become exactly zero
                                 95                 :                :      * (we assume there will not be roundoff error in the division).
                                 96                 :                :      * (Note: Knuth suggests a "<=" loop condition, but we use "<" just
                                 97                 :                :      * to be doubly sure about roundoff error.)  Therefore K cannot become
                                 98                 :                :      * less than k, which means that we cannot fail to select enough blocks.
                                 99                 :                :      *----------
                                100                 :                :      */
  868 tgl@sss.pgh.pa.us         101                 :UBC           0 :     V = sampler_random_fract(&bs->randstate);
 3257 simon@2ndQuadrant.co      102                 :              0 :     p = 1.0 - (double) k / (double) K;
                                103         [ #  # ]:              0 :     while (V < p)
                                104                 :                :     {
                                105                 :                :         /* skip */
                                106                 :              0 :         bs->t++;
                                107                 :              0 :         K--;                    /* keep K == N - t */
                                108                 :                : 
                                109                 :                :         /* adjust p to be new cutoff point in reduced range */
                                110                 :              0 :         p *= 1.0 - (double) k / (double) K;
                                111                 :                :     }
                                112                 :                : 
                                113                 :                :     /* select */
                                114                 :              0 :     bs->m++;
                                115                 :              0 :     return bs->t++;
                                116                 :                : }
                                117                 :                : 
                                118                 :                : /*
                                119                 :                :  * These two routines embody Algorithm Z from "Random sampling with a
                                120                 :                :  * reservoir" by Jeffrey S. Vitter, in ACM Trans. Math. Softw. 11, 1
                                121                 :                :  * (Mar. 1985), Pages 37-57.  Vitter describes his algorithm in terms
                                122                 :                :  * of the count S of records to skip before processing another record.
                                123                 :                :  * It is computed primarily based on t, the number of records already read.
                                124                 :                :  * The only extra state needed between calls is W, a random state variable.
                                125                 :                :  *
                                126                 :                :  * reservoir_init_selection_state computes the initial W value.
                                127                 :                :  *
                                128                 :                :  * Given that we've already read t records (t >= n), reservoir_get_next_S
                                129                 :                :  * determines the number of records to skip before the next record is
                                130                 :                :  * processed.
                                131                 :                :  */
                                132                 :                : void
 3257 simon@2ndQuadrant.co      133                 :CBC        6859 : reservoir_init_selection_state(ReservoirState rs, int n)
                                134                 :                : {
                                135                 :                :     /*
                                136                 :                :      * Reservoir sampling is not used anywhere where it would need to return
                                137                 :                :      * repeatable results so we can initialize it randomly.
                                138                 :                :      */
  868 tgl@sss.pgh.pa.us         139                 :           6859 :     sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
                                140                 :                :                               &rs->randstate);
                                141                 :                : 
                                142                 :                :     /* Initial value of W (for use when Algorithm Z is first applied) */
                                143                 :           6859 :     rs->W = exp(-log(sampler_random_fract(&rs->randstate)) / n);
 3257 simon@2ndQuadrant.co      144                 :           6859 : }
                                145                 :                : 
                                146                 :                : double
                                147                 :          57702 : reservoir_get_next_S(ReservoirState rs, double t, int n)
                                148                 :                : {
                                149                 :                :     double      S;
                                150                 :                : 
                                151                 :                :     /* The magic constant here is T from Vitter's paper */
                                152         [ +  - ]:          57702 :     if (t <= (22.0 * n))
                                153                 :                :     {
                                154                 :                :         /* Process records using Algorithm X until t is large enough */
                                155                 :                :         double      V,
                                156                 :                :                     quot;
                                157                 :                : 
  868 tgl@sss.pgh.pa.us         158                 :          57702 :         V = sampler_random_fract(&rs->randstate);    /* Generate V */
 3257 simon@2ndQuadrant.co      159                 :          57702 :         S = 0;
                                160                 :          57702 :         t += 1;
                                161                 :                :         /* Note: "num" in Vitter's code is always equal to t - n */
                                162                 :          57702 :         quot = (t - (double) n) / t;
                                163                 :                :         /* Find min S satisfying (4.1) */
                                164         [ +  + ]:         126799 :         while (quot > V)
                                165                 :                :         {
                                166                 :          69097 :             S += 1;
                                167                 :          69097 :             t += 1;
                                168                 :          69097 :             quot *= (t - (double) n) / t;
                                169                 :                :         }
                                170                 :                :     }
                                171                 :                :     else
                                172                 :                :     {
                                173                 :                :         /* Now apply Algorithm Z */
 3257 simon@2ndQuadrant.co      174                 :UBC           0 :         double      W = rs->W;
                                175                 :              0 :         double      term = t - (double) n + 1;
                                176                 :                : 
                                177                 :                :         for (;;)
                                178                 :              0 :         {
                                179                 :                :             double      numer,
                                180                 :                :                         numer_lim,
                                181                 :                :                         denom;
                                182                 :                :             double      U,
                                183                 :                :                         X,
                                184                 :                :                         lhs,
                                185                 :                :                         rhs,
                                186                 :                :                         y,
                                187                 :                :                         tmp;
                                188                 :                : 
                                189                 :                :             /* Generate U and X */
  868 tgl@sss.pgh.pa.us         190                 :              0 :             U = sampler_random_fract(&rs->randstate);
 3257 simon@2ndQuadrant.co      191                 :              0 :             X = t * (W - 1.0);
                                192                 :              0 :             S = floor(X);       /* S is tentatively set to floor(X) */
                                193                 :                :             /* Test if U <= h(S)/cg(X) in the manner of (6.3) */
                                194                 :              0 :             tmp = (t + 1) / term;
                                195                 :              0 :             lhs = exp(log(((U * tmp * tmp) * (term + S)) / (t + X)) / n);
                                196                 :              0 :             rhs = (((t + X) / (term + S)) * term) / t;
                                197         [ #  # ]:              0 :             if (lhs <= rhs)
                                198                 :                :             {
                                199                 :              0 :                 W = rhs / lhs;
                                200                 :              0 :                 break;
                                201                 :                :             }
                                202                 :                :             /* Test if U <= f(S)/cg(X) */
                                203                 :              0 :             y = (((U * (t + 1)) / term) * (t + S + 1)) / (t + X);
                                204         [ #  # ]:              0 :             if ((double) n < S)
                                205                 :                :             {
                                206                 :              0 :                 denom = t;
                                207                 :              0 :                 numer_lim = term + S;
                                208                 :                :             }
                                209                 :                :             else
                                210                 :                :             {
                                211                 :              0 :                 denom = t - (double) n + S;
                                212                 :              0 :                 numer_lim = t + 1;
                                213                 :                :             }
                                214         [ #  # ]:              0 :             for (numer = t + S; numer >= numer_lim; numer -= 1)
                                215                 :                :             {
                                216                 :              0 :                 y *= numer / denom;
                                217                 :              0 :                 denom -= 1;
                                218                 :                :             }
  868 tgl@sss.pgh.pa.us         219                 :              0 :             W = exp(-log(sampler_random_fract(&rs->randstate)) / n); /* Generate W in advance */
 3257 simon@2ndQuadrant.co      220         [ #  # ]:              0 :             if (exp(log(y) / n) <= (t + X) / t)
                                221                 :              0 :                 break;
                                222                 :                :         }
                                223                 :              0 :         rs->W = W;
                                224                 :                :     }
 3257 simon@2ndQuadrant.co      225                 :CBC       57702 :     return S;
                                226                 :                : }
                                227                 :                : 
                                228                 :                : 
                                229                 :                : /*----------
                                230                 :                :  * Random number generator used by sampling
                                231                 :                :  *----------
                                232                 :                :  */
                                233                 :                : void
  868 tgl@sss.pgh.pa.us         234                 :          13688 : sampler_random_init_state(uint32 seed, pg_prng_state *randstate)
                                235                 :                : {
                                236                 :          13688 :     pg_prng_seed(randstate, (uint64) seed);
 3257 simon@2ndQuadrant.co      237                 :          13688 : }
                                238                 :                : 
                                239                 :                : /* Select a random value R uniformly distributed in (0 - 1) */
                                240                 :                : double
  868 tgl@sss.pgh.pa.us         241                 :         122258 : sampler_random_fract(pg_prng_state *randstate)
                                242                 :                : {
                                243                 :                :     double      res;
                                244                 :                : 
                                245                 :                :     /* pg_prng_double returns a value in [0.0 - 1.0), so we must reject 0.0 */
                                246                 :                :     do
                                247                 :                :     {
                                248                 :         122258 :         res = pg_prng_double(randstate);
                                249         [ -  + ]:         122258 :     } while (unlikely(res == 0.0));
 3210                           250                 :         122258 :     return res;
                                251                 :                : }
                                252                 :                : 
                                253                 :                : 
                                254                 :                : /*
                                255                 :                :  * Backwards-compatible API for block sampling
                                256                 :                :  *
                                257                 :                :  * This code is now deprecated, but since it's still in use by many FDWs,
                                258                 :                :  * we should keep it for awhile at least.  The functionality is the same as
                                259                 :                :  * sampler_random_fract/reservoir_init_selection_state/reservoir_get_next_S,
                                260                 :                :  * except that a common random state is used across all callers.
                                261                 :                :  */
                                262                 :                : static ReservoirStateData oldrs;
                                263                 :                : static bool oldrs_initialized = false;
                                264                 :                : 
                                265                 :                : double
 3254 tgl@sss.pgh.pa.us         266                 :UBC           0 : anl_random_fract(void)
                                267                 :                : {
                                268                 :                :     /* initialize if first time through */
  868                           269         [ #  # ]:              0 :     if (unlikely(!oldrs_initialized))
                                270                 :                :     {
                                271                 :              0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
                                272                 :                :                                   &oldrs.randstate);
                                273                 :              0 :         oldrs_initialized = true;
                                274                 :                :     }
                                275                 :                : 
                                276                 :                :     /* and compute a random fraction */
                                277                 :              0 :     return sampler_random_fract(&oldrs.randstate);
                                278                 :                : }
                                279                 :                : 
                                280                 :                : double
 3254                           281                 :              0 : anl_init_selection_state(int n)
                                282                 :                : {
                                283                 :                :     /* initialize if first time through */
  868                           284         [ #  # ]:              0 :     if (unlikely(!oldrs_initialized))
                                285                 :                :     {
                                286                 :              0 :         sampler_random_init_state(pg_prng_uint32(&pg_global_prng_state),
                                287                 :                :                                   &oldrs.randstate);
                                288                 :              0 :         oldrs_initialized = true;
                                289                 :                :     }
                                290                 :                : 
                                291                 :                :     /* Initial value of W (for use when Algorithm Z is first applied) */
                                292                 :              0 :     return exp(-log(sampler_random_fract(&oldrs.randstate)) / n);
                                293                 :                : }
                                294                 :                : 
                                295                 :                : double
 3254                           296                 :              0 : anl_get_next_S(double t, int n, double *stateptr)
                                297                 :                : {
                                298                 :                :     double      result;
                                299                 :                : 
                                300                 :              0 :     oldrs.W = *stateptr;
                                301                 :              0 :     result = reservoir_get_next_S(&oldrs, t, n);
                                302                 :              0 :     *stateptr = oldrs.W;
                                303                 :              0 :     return result;
                                304                 :                : }
        

Generated by: LCOV version 2.1-beta2-3-g6141622