LCOV - differential code coverage report
Current view: top level - src/backend/lib - hyperloglog.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage HEAD vs 15 Lines: 69.8 % 63 44 19 44
Current Date: 2023-04-08 15:15:32 Functions: 83.3 % 6 5 1 5
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * hyperloglog.c
       4                 :  *    HyperLogLog cardinality estimator
       5                 :  *
       6                 :  * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group
       7                 :  *
       8                 :  * Based on Hideaki Ohno's C++ implementation.  This is probably not ideally
       9                 :  * suited to estimating the cardinality of very large sets;  in particular, we
      10                 :  * have not attempted to further optimize the implementation as described in
      11                 :  * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
      12                 :  * Engineering of a State of The Art Cardinality Estimation Algorithm".
      13                 :  *
      14                 :  * A sparse representation of HyperLogLog state is used, with fixed space
      15                 :  * overhead.
      16                 :  *
      17                 :  * The copyright terms of Ohno's original version (the MIT license) follow.
      18                 :  *
      19                 :  * IDENTIFICATION
      20                 :  *    src/backend/lib/hyperloglog.c
      21                 :  *
      22                 :  *-------------------------------------------------------------------------
      23                 :  */
      24                 : 
      25                 : /*
      26                 :  * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
      27                 :  *
      28                 :  * Permission is hereby granted, free of charge, to any person obtaining a copy
      29                 :  * of this software and associated documentation files (the 'Software'), to
      30                 :  * deal in the Software without restriction, including without limitation the
      31                 :  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
      32                 :  * sell copies of the Software, and to permit persons to whom the Software is
      33                 :  * furnished to do so, subject to the following conditions:
      34                 :  *
      35                 :  * The above copyright notice and this permission notice shall be included in
      36                 :  * all copies or substantial portions of the Software.
      37                 :  *
      38                 :  * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
      39                 :  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
      40                 :  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
      41                 :  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
      42                 :  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
      43                 :  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
      44                 :  * IN THE SOFTWARE.
      45                 :  */
      46                 : 
      47                 : #include "postgres.h"
      48                 : 
      49                 : #include <math.h>
      50                 : 
      51                 : #include "lib/hyperloglog.h"
      52                 : #include "port/pg_bitutils.h"
      53                 : 
      54                 : #define POW_2_32            (4294967296.0)
      55                 : #define NEG_POW_2_32        (-4294967296.0)
      56                 : 
      57                 : static inline uint8 rho(uint32 x, uint8 b);
      58                 : 
      59                 : /*
      60                 :  * Initialize HyperLogLog track state, by bit width
      61                 :  *
      62                 :  * bwidth is bit width (so register size will be 2 to the power of bwidth).
      63                 :  * Must be between 4 and 16 inclusive.
      64                 :  */
      65                 : void
      66 CBC       73828 : initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
      67                 : {
      68                 :     double      alpha;
      69                 : 
      70           73828 :     if (bwidth < 4 || bwidth > 16)
      71 UBC           0 :         elog(ERROR, "bit width must be between 4 and 16 inclusive");
      72                 : 
      73 CBC       73828 :     cState->registerWidth = bwidth;
      74           73828 :     cState->nRegisters = (Size) 1 << bwidth;
      75           73828 :     cState->arrSize = sizeof(uint8) * cState->nRegisters + 1;
      76                 : 
      77                 :     /*
      78                 :      * Initialize hashes array to zero, not negative infinity, per discussion
      79                 :      * of the coupon collector problem in the HyperLogLog paper
      80                 :      */
      81           73828 :     cState->hashesArr = palloc0(cState->arrSize);
      82                 : 
      83                 :     /*
      84                 :      * "alpha" is a value that for each possible number of registers (m) is
      85                 :      * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z
      86                 :      * is "the indicator function" through which we finally compute E,
      87                 :      * estimated cardinality).
      88                 :      */
      89           73828 :     switch (cState->nRegisters)
      90                 :     {
      91 UBC           0 :         case 16:
      92               0 :             alpha = 0.673;
      93               0 :             break;
      94 CBC       25272 :         case 32:
      95           25272 :             alpha = 0.697;
      96           25272 :             break;
      97 UBC           0 :         case 64:
      98               0 :             alpha = 0.709;
      99               0 :             break;
     100 CBC       48556 :         default:
     101           48556 :             alpha = 0.7213 / (1.0 + 1.079 / cState->nRegisters);
     102                 :     }
     103                 : 
     104                 :     /*
     105                 :      * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog
     106                 :      * estimate E
     107                 :      */
     108           73828 :     cState->alphaMM = alpha * cState->nRegisters * cState->nRegisters;
     109           73828 : }
     110                 : 
     111                 : /*
     112                 :  * Initialize HyperLogLog track state, by error rate
     113                 :  *
     114                 :  * Instead of specifying bwidth (number of bits used for addressing the
     115                 :  * register), this method allows sizing the counter for particular error
     116                 :  * rate using a simple formula from the paper:
     117                 :  *
     118                 :  *   e = 1.04 / sqrt(m)
     119                 :  *
     120                 :  * where 'm' is the number of registers, i.e. (2^bwidth). The method
     121                 :  * finds the lowest bwidth with 'e' below the requested error rate, and
     122                 :  * then uses it to initialize the counter.
     123                 :  *
     124                 :  * As bwidth has to be between 4 and 16, the worst possible error rate
     125                 :  * is between ~25% (bwidth=4) and 0.4% (bwidth=16).
     126                 :  */
     127                 : void
     128 UBC           0 : initHyperLogLogError(hyperLogLogState *cState, double error)
     129                 : {
     130               0 :     uint8       bwidth = 4;
     131                 : 
     132               0 :     while (bwidth < 16)
     133                 :     {
     134               0 :         double      m = (Size) 1 << bwidth;
     135                 : 
     136               0 :         if (1.04 / sqrt(m) < error)
     137               0 :             break;
     138               0 :         bwidth++;
     139                 :     }
     140                 : 
     141               0 :     initHyperLogLog(cState, bwidth);
     142               0 : }
     143                 : 
     144                 : /*
     145                 :  * Free HyperLogLog track state
     146                 :  *
     147                 :  * Releases allocated resources, but not the state itself (in case it's not
     148                 :  * allocated by palloc).
     149                 :  */
     150                 : void
     151 CBC       13506 : freeHyperLogLog(hyperLogLogState *cState)
     152                 : {
     153           13506 :     Assert(cState->hashesArr != NULL);
     154           13506 :     pfree(cState->hashesArr);
     155           13506 : }
     156                 : 
     157                 : /*
     158                 :  * Adds element to the estimator, from caller-supplied hash.
     159                 :  *
     160                 :  * It is critical that the hash value passed be an actual hash value, typically
     161                 :  * generated using hash_any().  The algorithm relies on a specific bit-pattern
     162                 :  * observable in conjunction with stochastic averaging.  There must be a
     163                 :  * uniform distribution of bits in hash values for each distinct original value
     164                 :  * observed.
     165                 :  */
     166                 : void
     167         2878529 : addHyperLogLog(hyperLogLogState *cState, uint32 hash)
     168                 : {
     169                 :     uint8       count;
     170                 :     uint32      index;
     171                 : 
     172                 :     /* Use the first "k" (registerWidth) bits as a zero based index */
     173         2878529 :     index = hash >> (BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
     174                 : 
     175                 :     /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
     176         2878529 :     count = rho(hash << cState->registerWidth,
     177         2878529 :                 BITS_PER_BYTE * sizeof(uint32) - cState->registerWidth);
     178                 : 
     179         2878529 :     cState->hashesArr[index] = Max(count, cState->hashesArr[index]);
     180         2878529 : }
     181                 : 
     182                 : /*
     183                 :  * Estimates cardinality, based on elements added so far
     184                 :  */
     185                 : double
     186           15046 : estimateHyperLogLog(hyperLogLogState *cState)
     187                 : {
     188                 :     double      result;
     189           15046 :     double      sum = 0.0;
     190                 :     int         i;
     191                 : 
     192         2024198 :     for (i = 0; i < cState->nRegisters; i++)
     193                 :     {
     194         2009152 :         sum += 1.0 / pow(2.0, cState->hashesArr[i]);
     195                 :     }
     196                 : 
     197                 :     /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
     198           15046 :     result = cState->alphaMM / sum;
     199                 : 
     200           15046 :     if (result <= (5.0 / 2.0) * cState->nRegisters)
     201                 :     {
     202                 :         /* Small range correction */
     203           14649 :         int         zero_count = 0;
     204                 : 
     205         1935705 :         for (i = 0; i < cState->nRegisters; i++)
     206                 :         {
     207         1921056 :             if (cState->hashesArr[i] == 0)
     208         1527909 :                 zero_count++;
     209                 :         }
     210                 : 
     211           14649 :         if (zero_count != 0)
     212           14649 :             result = cState->nRegisters * log((double) cState->nRegisters /
     213                 :                                               zero_count);
     214                 :     }
     215             397 :     else if (result > (1.0 / 30.0) * POW_2_32)
     216                 :     {
     217                 :         /* Large range correction */
     218 UBC           0 :         result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32));
     219                 :     }
     220                 : 
     221 CBC       15046 :     return result;
     222                 : }
     223                 : 
     224                 : /*
     225                 :  * Worker for addHyperLogLog().
     226                 :  *
     227                 :  * Calculates the position of the first set bit in first b bits of x argument
     228                 :  * starting from the first, reading from most significant to least significant
     229                 :  * bits.
     230                 :  *
     231                 :  * Example (when considering fist 10 bits of x):
     232                 :  *
     233                 :  * rho(x = 0b1000000000)   returns 1
     234                 :  * rho(x = 0b0010000000)   returns 3
     235                 :  * rho(x = 0b0000000000)   returns b + 1
     236                 :  *
     237                 :  * "The binary address determined by the first b bits of x"
     238                 :  *
     239                 :  * Return value "j" used to index bit pattern to watch.
     240                 :  */
     241                 : static inline uint8
     242         2878529 : rho(uint32 x, uint8 b)
     243                 : {
     244         2878529 :     uint8       j = 1;
     245                 : 
     246         2878529 :     if (x == 0)
     247 UBC           0 :         return b + 1;
     248                 : 
     249 CBC     2878529 :     j = 32 - pg_leftmost_one_pos32(x);
     250                 : 
     251         2878529 :     if (j > b)
     252 UBC           0 :         return b + 1;
     253                 : 
     254 CBC     2878529 :     return j;
     255                 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a