LCOV - differential code coverage report
Current view: top level - src/port - pg_popcount_avx512.c (source / functions) Coverage Total Hit UNC
Current: Differential Code Coverage 16@8cea358b128 vs 17@8cea358b128 Lines: 0.0 % 48 0 48
Current Date: 2024-04-14 14:21:10 Functions: 0.0 % 2 0 2
Baseline: 16@8cea358b128 Branches: 0.0 % 8 0 8
Baseline Date: 2024-04-14 14:21:09 Line coverage date bins:
Legend: Lines: hit not hit | Branches: + taken - not taken # not executed [..60] days: 0.0 % 48 0 48
Function coverage date bins:
[..60] days: 0.0 % 2 0 2
Branch coverage date bins:
[..60] days: 0.0 % 8 0 8

 Age         Owner                    Branch data    TLA  Line data    Source code
                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  *
                                  3                 :                :  * pg_popcount_avx512.c
                                  4                 :                :  *    Holds the AVX-512 pg_popcount() implementation.
                                  5                 :                :  *
                                  6                 :                :  * Copyright (c) 2024, PostgreSQL Global Development Group
                                  7                 :                :  *
                                  8                 :                :  * IDENTIFICATION
                                  9                 :                :  *    src/port/pg_popcount_avx512.c
                                 10                 :                :  *
                                 11                 :                :  *-------------------------------------------------------------------------
                                 12                 :                :  */
                                 13                 :                : #include "c.h"
                                 14                 :                : 
                                 15                 :                : #include <immintrin.h>
                                 16                 :                : 
                                 17                 :                : #include "port/pg_bitutils.h"
                                 18                 :                : 
                                 19                 :                : /*
                                 20                 :                :  * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
                                 21                 :                :  * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
                                 22                 :                :  * the function pointers that are only used when TRY_POPCNT_FAST is set.
                                 23                 :                :  */
                                 24                 :                : #ifdef TRY_POPCNT_FAST
                                 25                 :                : 
                                 26                 :                : /*
                                 27                 :                :  * pg_popcount_avx512
                                 28                 :                :  *      Returns the number of 1-bits in buf
                                 29                 :                :  */
                                 30                 :                : uint64
    8 nathan@postgresql.or       31                 :UNC           0 : pg_popcount_avx512(const char *buf, int bytes)
                                 32                 :                : {
                                 33                 :                :     __m512i     val,
                                 34                 :                :                 cnt;
                                 35                 :              0 :     __m512i     accum = _mm512_setzero_si512();
                                 36                 :                :     const char *final;
                                 37                 :                :     int         tail_idx;
                                 38                 :              0 :     __mmask64   mask = ~UINT64CONST(0);
                                 39                 :                : 
                                 40                 :                :     /*
                                 41                 :                :      * Align buffer down to avoid double load overhead from unaligned access.
                                 42                 :                :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
                                 43                 :                :      * iteration and ensure it is not empty.
                                 44                 :                :      */
                                 45                 :              0 :     mask <<= ((uintptr_t) buf) % sizeof(__m512i);
                                 46                 :              0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
                                 47                 :              0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
                                 48                 :              0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
                                 49                 :                : 
                                 50                 :                :     /*
                                 51                 :                :      * Iterate through all but the final iteration.  Starting from the second
                                 52                 :                :      * iteration, the mask is ignored.
                                 53                 :                :      */
                                 54         [ #  # ]:              0 :     if (buf < final)
                                 55                 :                :     {
                                 56                 :              0 :         val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
                                 57                 :              0 :         cnt = _mm512_popcnt_epi64(val);
                                 58                 :              0 :         accum = _mm512_add_epi64(accum, cnt);
                                 59                 :                : 
                                 60                 :              0 :         buf += sizeof(__m512i);
                                 61                 :              0 :         mask = ~UINT64CONST(0);
                                 62                 :                : 
                                 63         [ #  # ]:              0 :         for (; buf < final; buf += sizeof(__m512i))
                                 64                 :                :         {
                                 65                 :              0 :             val = _mm512_load_si512((const __m512i *) buf);
                                 66                 :              0 :             cnt = _mm512_popcnt_epi64(val);
                                 67                 :              0 :             accum = _mm512_add_epi64(accum, cnt);
                                 68                 :                :         }
                                 69                 :                :     }
                                 70                 :                : 
                                 71                 :                :     /* Final iteration needs to ignore bytes that are not within the length */
                                 72                 :              0 :     mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
                                 73                 :                : 
                                 74                 :              0 :     val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
                                 75                 :              0 :     cnt = _mm512_popcnt_epi64(val);
                                 76                 :              0 :     accum = _mm512_add_epi64(accum, cnt);
                                 77                 :                : 
                                 78                 :              0 :     return _mm512_reduce_add_epi64(accum);
                                 79                 :                : }
                                 80                 :                : 
                                 81                 :                : /*
                                 82                 :                :  * pg_popcount_masked_avx512
                                 83                 :                :  *      Returns the number of 1-bits in buf after applying the mask to each byte
                                 84                 :                :  */
                                 85                 :                : uint64
                                 86                 :              0 : pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
                                 87                 :                : {
                                 88                 :                :     __m512i     val,
                                 89                 :                :                 vmasked,
                                 90                 :                :                 cnt;
                                 91                 :              0 :     __m512i     accum = _mm512_setzero_si512();
                                 92                 :                :     const char *final;
                                 93                 :                :     int         tail_idx;
                                 94                 :              0 :     __mmask64   bmask = ~UINT64CONST(0);
                                 95                 :              0 :     const       __m512i maskv = _mm512_set1_epi8(mask);
                                 96                 :                : 
                                 97                 :                :     /*
                                 98                 :                :      * Align buffer down to avoid double load overhead from unaligned access.
                                 99                 :                :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
                                100                 :                :      * iteration and ensure it is not empty.
                                101                 :                :      */
                                102                 :              0 :     bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
                                103                 :              0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
                                104                 :              0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
                                105                 :              0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
                                106                 :                : 
                                107                 :                :     /*
                                108                 :                :      * Iterate through all but the final iteration.  Starting from the second
                                109                 :                :      * iteration, the mask is ignored.
                                110                 :                :      */
                                111         [ #  # ]:              0 :     if (buf < final)
                                112                 :                :     {
                                113                 :              0 :         val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
                                114                 :              0 :         vmasked = _mm512_and_si512(val, maskv);
                                115                 :              0 :         cnt = _mm512_popcnt_epi64(vmasked);
                                116                 :              0 :         accum = _mm512_add_epi64(accum, cnt);
                                117                 :                : 
                                118                 :              0 :         buf += sizeof(__m512i);
                                119                 :              0 :         bmask = ~UINT64CONST(0);
                                120                 :                : 
                                121         [ #  # ]:              0 :         for (; buf < final; buf += sizeof(__m512i))
                                122                 :                :         {
                                123                 :              0 :             val = _mm512_load_si512((const __m512i *) buf);
                                124                 :              0 :             vmasked = _mm512_and_si512(val, maskv);
                                125                 :              0 :             cnt = _mm512_popcnt_epi64(vmasked);
                                126                 :              0 :             accum = _mm512_add_epi64(accum, cnt);
                                127                 :                :         }
                                128                 :                :     }
                                129                 :                : 
                                130                 :                :     /* Final iteration needs to ignore bytes that are not within the length */
                                131                 :              0 :     bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
                                132                 :                : 
                                133                 :              0 :     val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
                                134                 :              0 :     vmasked = _mm512_and_si512(val, maskv);
                                135                 :              0 :     cnt = _mm512_popcnt_epi64(vmasked);
                                136                 :              0 :     accum = _mm512_add_epi64(accum, cnt);
                                137                 :                : 
                                138                 :              0 :     return _mm512_reduce_add_epi64(accum);
                                139                 :                : }
                                140                 :                : 
                                141                 :                : #endif                          /* TRY_POPCNT_FAST */
        

Generated by: LCOV version 2.1-beta2-3-g6141622