Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtdedup.c
4 : * Deduplicate or bottom-up delete items in Postgres btrees.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtdedup.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/nbtree.h"
18 : #include "access/nbtxlog.h"
19 : #include "access/xloginsert.h"
20 : #include "miscadmin.h"
21 : #include "utils/rel.h"
22 :
23 : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
24 : TM_IndexDeleteOp *delstate);
25 : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
26 : OffsetNumber minoff, IndexTuple newitem);
27 : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
28 : Size newitemsz);
29 : #ifdef USE_ASSERT_CHECKING
30 : static bool _bt_posting_valid(IndexTuple posting);
31 : #endif
32 :
33 : /*
34 : * Perform a deduplication pass.
35 : *
36 : * The general approach taken here is to perform as much deduplication as
37 : * possible to free as much space as possible. Note, however, that "single
38 : * value" strategy is used for !bottomupdedup callers when the page is full of
39 : * tuples of a single value. Deduplication passes that apply the strategy
40 : * will leave behind a few untouched tuples at the end of the page, preparing
41 : * the page for an anticipated page split that uses nbtsplitloc.c's own single
42 : * value strategy. Our high level goal is to delay merging the untouched
43 : * tuples until after the page splits.
44 : *
45 : * When a call to _bt_bottomupdel_pass() just took place (and failed), our
46 : * high level goal is to prevent a page split entirely by buying more time.
47 : * We still hope that a page split can be avoided altogether. That's why
48 : * single value strategy is not even considered for bottomupdedup callers.
49 : *
50 : * The page will have to be split if we cannot successfully free at least
51 : * newitemsz (we also need space for newitem's line pointer, which isn't
52 : * included in caller's newitemsz).
53 : *
54 : * Note: Caller should have already deleted all existing items with their
55 : * LP_DEAD bits set.
56 : */
57 : void
873 pg 58 CBC 28659 : _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
59 : Size newitemsz, bool bottomupdedup)
60 : {
61 : OffsetNumber offnum,
62 : minoff,
63 : maxoff;
1138 64 28659 : Page page = BufferGetPage(buf);
373 michael 65 28659 : BTPageOpaque opaque = BTPageGetOpaque(page);
66 : Page newpage;
67 : BTDedupState state;
368 tgl 68 28659 : Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
1138 pg 69 28659 : bool singlevalstrat = false;
1107 70 28659 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71 :
72 : /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
1138 73 28659 : newitemsz += sizeof(ItemIdData);
74 :
75 : /*
76 : * Initialize deduplication state.
77 : *
78 : * It would be possible for maxpostingsize (limit on posting list tuple
79 : * size) to be set to one third of the page. However, it seems like a
80 : * good idea to limit the size of posting lists to one sixth of a page.
81 : * That ought to leave us with a good split point when pages full of
82 : * duplicates can be split several times.
83 : */
84 28659 : state = (BTDedupState) palloc(sizeof(BTDedupStateData));
85 28659 : state->deduplicate = true;
1024 86 28659 : state->nmaxitems = 0;
1138 87 28659 : state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
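	/*
	 * With the default 8 kB block size, BTMaxItemSize() is roughly 2.7 kB
	 * (about a third of the page), so posting list tuples are capped at
	 * roughly 1.35 kB -- the "one sixth of a page" described above.
	 */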
88 : /* Metadata about base tuple of current pending posting list */
89 28659 : state->base = NULL;
90 28659 : state->baseoff = InvalidOffsetNumber;
91 28659 : state->basetupsize = 0;
92 : /* Metadata about current pending posting list TIDs */
93 28659 : state->htids = palloc(state->maxpostingsize);
94 28659 : state->nhtids = 0;
95 28659 : state->nitems = 0;
96 : /* Size of all physical tuples to be replaced by pending posting list */
97 28659 : state->phystupsize = 0;
98 : /* nintervals should be initialized to zero */
99 28659 : state->nintervals = 0;
100 :
873 101 28659 : minoff = P_FIRSTDATAKEY(opaque);
102 28659 : maxoff = PageGetMaxOffsetNumber(page);
103 :
104 : /*
105 : * Consider applying "single value" strategy, though only if the page
106 : * seems likely to be split in the near future
107 : */
565 108 28659 : if (!bottomupdedup)
1138 109 25291 : singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
110 :
111 : /*
112 : * Deduplicate items from page, and write them to newpage.
113 : *
114 : * Copy the original page's LSN into newpage copy. This will become the
115 : * updated version of the page. We need this because XLogInsert will
116 : * examine the LSN and possibly dump it in a page image.
117 : */
118 28659 : newpage = PageGetTempPageCopySpecial(page);
119 28659 : PageSetLSN(newpage, PageGetLSN(page));
120 :
121 : /* Copy high key, if any */
122 28659 : if (!P_RIGHTMOST(opaque))
123 : {
124 21027 : ItemId hitemid = PageGetItemId(page, P_HIKEY);
125 21027 : Size hitemsz = ItemIdGetLength(hitemid);
126 21027 : IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
127 :
128 21027 : if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
129 : false, false) == InvalidOffsetNumber)
1138 pg 130 UBC 0 : elog(ERROR, "deduplication failed to add highkey");
131 : }
132 :
1138 pg 133 CBC 28659 : for (offnum = minoff;
134 6486288 : offnum <= maxoff;
135 6457629 : offnum = OffsetNumberNext(offnum))
136 : {
137 6457629 : ItemId itemid = PageGetItemId(page, offnum);
138 6457629 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139 :
140 6457629 : Assert(!ItemIdIsDead(itemid));
141 :
142 6457629 : if (offnum == minoff)
143 : {
144 : /*
145 : * No previous/base tuple for the data item -- use the data item
146 : * as base tuple of pending posting list
147 : */
148 28659 : _bt_dedup_start_pending(state, itup, offnum);
149 : }
150 12856912 : else if (state->deduplicate &&
1107 151 7275837 : _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
1138 152 847895 : _bt_dedup_save_htid(state, itup))
153 : {
154 : /*
155 : * Tuple is equal to base tuple of pending posting list. Heap
156 : * TID(s) for itup have been saved in state.
157 : */
158 : }
159 : else
160 : {
161 : /*
162 : * Tuple is not equal to pending posting list tuple, or
163 : * _bt_dedup_save_htid() opted to not merge current item into
164 : * pending posting list for some other reason (e.g., adding more
165 : * TIDs would have caused posting list to exceed current
166 : * maxpostingsize).
167 : *
168 : * If state contains pending posting list with more than one item,
169 : * form new posting tuple and add it to our temp page (newpage).
170 : * Else add pending interval's base tuple to the temp page as-is.
171 : */
172 5591710 : pagesaving += _bt_dedup_finish_pending(newpage, state);
173 :
174 5591710 : if (singlevalstrat)
175 : {
176 : /*
177 : * Single value strategy's extra steps.
178 : *
179 : * Lower maxpostingsize for sixth and final large posting list
180 : * tuple at the point where 5 maxpostingsize-capped tuples
181 : * have either been formed or observed.
182 : *
183 : * When a sixth maxpostingsize-capped item is formed/observed,
184 : * stop merging together tuples altogether. The few tuples
185 : * that remain at the end of the page won't be merged together
186 : * at all (at least not until after a future page split takes
187 : * place, when this page's newly allocated right sibling page
188 : * gets its first deduplication pass).
189 : */
1024 pg 190 GIC 2380 : if (state->nmaxitems == 5)
1138 pg 191 CBC 318 : _bt_singleval_fillfactor(page, state, newitemsz);
1024 192 2062 : else if (state->nmaxitems == 6)
1138 pg 193 ECB : {
1138 pg 194 GIC 123 : state->deduplicate = false;
1138 pg 195 CBC 123 : singlevalstrat = false; /* won't be back here */
1138 pg 196 ECB : }
197 : }
198 :
199 : /* itup starts new pending posting list */
1138 pg 200 GIC 5591710 : _bt_dedup_start_pending(state, itup, offnum);
1138 pg 201 ECB : }
202 : }
203 :
204 : /* Handle the last item */
1138 pg 205 GIC 28659 : pagesaving += _bt_dedup_finish_pending(newpage, state);
1138 pg 206 ECB :
207 : /*
208 : * If no items suitable for deduplication were found, newpage must be
209 : * exactly the same as the original page, so just return from function.
210 : *
211 : * We could determine whether or not to proceed on the basis of the space
212 : * savings being sufficient to avoid an immediate page split instead. We
213 : * don't do that because there is some small value in nbtsplitloc.c always
214 : * operating against a page that is fully deduplicated (apart from
215 : * newitem). Besides, most of the cost has already been paid.
216 : */
1138 pg 217 GIC 28659 : if (state->nintervals == 0)
1138 pg 218 ECB : {
219 : /* cannot leak memory here */
1138 pg 220 GIC 3729 : pfree(newpage);
1138 pg 221 CBC 3729 : pfree(state->htids);
222 3729 : pfree(state);
223 3729 : return;
1138 pg 224 ECB : }
225 :
226 : /*
227 : * By here, it's clear that deduplication will definitely go ahead.
228 : *
229 : * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
230 : * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
231 : * But keep things tidy.
232 : */
1138 pg 233 GIC 24930 : if (P_HAS_GARBAGE(opaque))
1138 pg 234 ECB : {
373 michael 235 UIC 0 : BTPageOpaque nopaque = BTPageGetOpaque(newpage);
1138 pg 236 EUB :
1138 pg 237 UIC 0 : nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1138 pg 238 EUB : }
239 :
1138 pg 240 GIC 24930 : START_CRIT_SECTION();
1138 pg 241 ECB :
1138 pg 242 GIC 24930 : PageRestoreTempPage(newpage, page);
1138 pg 243 CBC 24930 : MarkBufferDirty(buf);
1138 pg 244 ECB :
245 : /* XLOG stuff */
1138 pg 246 GIC 24930 : if (RelationNeedsWAL(rel))
1138 pg 247 ECB : {
248 : XLogRecPtr recptr;
249 : xl_btree_dedup xlrec_dedup;
250 :
1138 pg 251 GIC 24867 : xlrec_dedup.nintervals = state->nintervals;
1138 pg 252 ECB :
1138 pg 253 GIC 24867 : XLogBeginInsert();
1138 pg 254 CBC 24867 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
255 24867 : XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
1138 pg 256 ECB :
257 : /*
258 : * The intervals array is not in the buffer, but pretend that it is.
259 : * When XLogInsert stores the whole buffer, the array need not be
260 : * stored too.
261 : */
1138 pg 262 GIC 24867 : XLogRegisterBufData(0, (char *) state->intervals,
1138 pg 263 CBC 24867 : state->nintervals * sizeof(BTDedupInterval));
1138 pg 264 ECB :
1138 pg 265 GIC 24867 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
1138 pg 266 ECB :
1138 pg 267 GIC 24867 : PageSetLSN(page, recptr);
1138 pg 268 ECB : }
269 :
1138 pg 270 GIC 24930 : END_CRIT_SECTION();
1138 pg 271 ECB :
272 : /* Local space accounting should agree with page accounting */
1138 pg 273 GIC 24930 : Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
1138 pg 274 ECB :
275 : /* cannot leak memory here */
1138 pg 276 GIC 24930 : pfree(state->htids);
1138 pg 277 CBC 24930 : pfree(state);
1138 pg 278 ECB : }
279 :
280 : /*
281 : * Perform bottom-up index deletion pass.
282 : *
283 : * See if duplicate index tuples (plus certain nearby tuples) are eligible to
284 : * be deleted via bottom-up index deletion. The high level goal here is to
285 : * entirely prevent "unnecessary" page splits caused by MVCC version churn
286 : * from UPDATEs (when the UPDATEs don't logically modify any of the columns
287 : * covered by the 'rel' index). This is qualitative, not quantitative -- we
288 : * do not particularly care about once-off opportunities to delete many index
289 : * tuples together.
290 : *
291 : * See nbtree/README for details on the design of nbtree bottom-up deletion.
292 : * See access/tableam.h for a description of how we're expected to cooperate
293 : * with the tableam.
294 : *
295 : * Returns true on success, in which case caller can assume page split will be
296 : * avoided for a reasonable amount of time. Returns false when caller should
297 : * deduplicate the page (if possible at all).
298 : *
299 : * Note: Occasionally we return true despite failing to delete enough items to
300 : * avoid a split. This makes caller skip deduplication and go split the page
301 : * right away. Our return value is always just advisory information.
302 : *
303 : * Note: Caller should have already deleted all existing items with their
304 : * LP_DEAD bits set.
305 : */
306 : bool
816 pg 307 GIC 3512 : _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
816 pg 308 ECB : Size newitemsz)
309 : {
310 : OffsetNumber offnum,
311 : minoff,
312 : maxoff;
816 pg 313 GIC 3512 : Page page = BufferGetPage(buf);
373 michael 314 CBC 3512 : BTPageOpaque opaque = BTPageGetOpaque(page);
816 pg 315 ECB : BTDedupState state;
316 : TM_IndexDeleteOp delstate;
317 : bool neverdedup;
816 pg 318 GIC 3512 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
816 pg 319 ECB :
320 : /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
816 pg 321 GIC 3512 : newitemsz += sizeof(ItemIdData);
816 pg 322 ECB :
323 : /* Initialize deduplication state */
816 pg 324 GIC 3512 : state = (BTDedupState) palloc(sizeof(BTDedupStateData));
816 pg 325 CBC 3512 : state->deduplicate = true;
326 3512 : state->nmaxitems = 0;
327 3512 : state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
328 3512 : state->base = NULL;
329 3512 : state->baseoff = InvalidOffsetNumber;
330 3512 : state->basetupsize = 0;
331 3512 : state->htids = palloc(state->maxpostingsize);
332 3512 : state->nhtids = 0;
333 3512 : state->nitems = 0;
334 3512 : state->phystupsize = 0;
335 3512 : state->nintervals = 0;
816 pg 336 ECB :
337 : /*
338 : * Initialize tableam state that describes bottom-up index deletion
339 : * operation.
340 : *
341 : * We'll go on to ask the tableam to search for TIDs whose index tuples we
342 : * can safely delete. The tableam will search until our leaf page space
343 : * target is satisfied, or until the cost of continuing with the tableam
344 : * operation seems too high. It focuses its efforts on TIDs associated
345 : * with duplicate index tuples that we mark "promising".
346 : *
347 : * This space target is a little arbitrary. The tableam must be able to
348 : * keep the costs and benefits in balance. We provide the tableam with
349 : * exhaustive information about what might work, without directly
350 : * concerning ourselves with avoiding work during the tableam call. Our
351 : * role in costing the bottom-up deletion process is strictly advisory.
352 : */
521 pg 353 GIC 3512 : delstate.irel = rel;
521 pg 354 CBC 3512 : delstate.iblknum = BufferGetBlockNumber(buf);
816 355 3512 : delstate.bottomup = true;
356 3512 : delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
357 3512 : delstate.ndeltids = 0;
358 3512 : delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
359 3512 : delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
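	/*
	 * With the default 8 kB BLCKSZ, the bottomupfreespace target set above
	 * works out to at least 512 bytes.
	 */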
816 pg 360 ECB :
816 pg 361 GIC 3512 : minoff = P_FIRSTDATAKEY(opaque);
816 pg 362 CBC 3512 : maxoff = PageGetMaxOffsetNumber(page);
363 3512 : for (offnum = minoff;
364 1030423 : offnum <= maxoff;
365 1026911 : offnum = OffsetNumberNext(offnum))
816 pg 366 ECB : {
816 pg 367 GIC 1026911 : ItemId itemid = PageGetItemId(page, offnum);
816 pg 368 CBC 1026911 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
816 pg 369 ECB :
816 pg 370 GIC 1026911 : Assert(!ItemIdIsDead(itemid));
816 pg 371 ECB :
816 pg 372 GIC 1026911 : if (offnum == minoff)
816 pg 373 ECB : {
374 : /* itup starts first pending interval */
816 pg 375 GIC 3512 : _bt_dedup_start_pending(state, itup, offnum);
816 pg 376 ECB : }
816 pg 377 GIC 1177990 : else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
816 pg 378 CBC 154591 : _bt_dedup_save_htid(state, itup))
816 pg 379 ECB : {
380 : /* Tuple is equal; just added its TIDs to pending interval */
381 : }
382 : else
383 : {
384 : /* Finalize interval -- move its TIDs to delete state */
816 pg 385 GIC 868808 : _bt_bottomupdel_finish_pending(page, state, &delstate);
816 pg 386 ECB :
387 : /* itup starts new pending interval */
816 pg 388 GIC 868808 : _bt_dedup_start_pending(state, itup, offnum);
816 pg 389 ECB : }
390 : }
391 : /* Finalize final interval -- move its TIDs to delete state */
816 pg 392 GIC 3512 : _bt_bottomupdel_finish_pending(page, state, &delstate);
816 pg 393 ECB :
394 : /*
395 : * We don't give up now in the event of having few (or even zero)
396 : * promising tuples for the tableam because it's not up to us as the index
397 : * AM to manage costs (note that the tableam might have heuristics of its
398 : * own that work out what to do). We should at least avoid having our
399 : * caller do a useless deduplication pass after we return in the event of
400 : * zero promising tuples, though.
401 : */
816 pg 402 GIC 3512 : neverdedup = false;
816 pg 403 CBC 3512 : if (state->nintervals == 0)
404 5 : neverdedup = true;
816 pg 405 ECB :
816 pg 406 GIC 3512 : pfree(state->htids);
816 pg 407 CBC 3512 : pfree(state);
816 pg 408 ECB :
409 : /* Ask tableam which TIDs are deletable, then physically delete them */
816 pg 410 GIC 3512 : _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
816 pg 411 ECB :
816 pg 412 GIC 3512 : pfree(delstate.deltids);
816 pg 413 CBC 3512 : pfree(delstate.status);
816 pg 414 ECB :
415 : /* Report "success" to caller unconditionally to avoid deduplication */
816 pg 416 GIC 3512 : if (neverdedup)
816 pg 417 CBC 5 : return true;
816 pg 418 ECB :
419 : /* Don't dedup when we won't end up back here any time soon anyway */
816 pg 420 GIC 3507 : return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
816 pg 421 ECB : }
422 :
423 : /*
424 : * Create a new pending posting list tuple based on caller's base tuple.
425 : *
426 : * Every tuple processed by deduplication either becomes the base tuple for a
427 : * posting list, or gets its heap TID(s) accepted into a pending posting list.
428 : * A tuple that starts out as the base tuple for a posting list will only
429 : * actually be rewritten within _bt_dedup_finish_pending() when it turns out
430 : * that there are duplicates that can be merged into the base tuple.
431 : */
432 : void
1138 pg 433 GIC 9274749 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
1138 pg 434 ECB : OffsetNumber baseoff)
435 : {
1138 pg 436 GIC 9274749 : Assert(state->nhtids == 0);
1138 pg 437 CBC 9274749 : Assert(state->nitems == 0);
438 9274749 : Assert(!BTreeTupleIsPivot(base));
1138 pg 439 ECB :
440 : /*
441 : * Copy heap TID(s) from new base tuple for new candidate posting list
442 : * into working state's array
443 : */
1138 pg 444 GIC 9274749 : if (!BTreeTupleIsPosting(base))
1138 pg 445 ECB : {
1138 pg 446 GIC 7434111 : memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
1138 pg 447 CBC 7434111 : state->nhtids = 1;
448 7434111 : state->basetupsize = IndexTupleSize(base);
1138 pg 449 ECB : }
450 : else
451 : {
452 : int nposting;
453 :
1138 pg 454 GIC 1840638 : nposting = BTreeTupleGetNPosting(base);
1138 pg 455 CBC 1840638 : memcpy(state->htids, BTreeTupleGetPosting(base),
1138 pg 456 ECB : sizeof(ItemPointerData) * nposting);
1138 pg 457 GIC 1840638 : state->nhtids = nposting;
1138 pg 458 ECB : /* basetupsize should not include existing posting list */
1138 pg 459 GIC 1840638 : state->basetupsize = BTreeTupleGetPostingOffset(base);
1138 pg 460 ECB : }
461 :
462 : /*
463 : * Save new base tuple itself -- it'll be needed if we actually create a
464 : * new posting list from new pending posting list.
465 : *
466 : * Must maintain physical size of all existing tuples (including line
467 : * pointer overhead) so that we can calculate space savings on page.
468 : */
1138 pg 469 GIC 9274749 : state->nitems = 1;
1138 pg 470 CBC 9274749 : state->base = base;
471 9274749 : state->baseoff = baseoff;
472 9274749 : state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
1138 pg 473 ECB : /* Also save baseoff in pending state for interval */
1138 pg 474 GIC 9274749 : state->intervals[state->nintervals].baseoff = state->baseoff;
1138 pg 475 CBC 9274749 : }
1138 pg 476 ECB :
477 : /*
478 : * Save itup heap TID(s) into pending posting list where possible.
479 : *
480 : * Returns bool indicating if the pending posting list managed by state now
481 : * includes itup's heap TID(s).
482 : */
483 : bool
1138 pg 484 GIC 1749807 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
1138 pg 485 ECB : {
486 : int nhtids;
487 : ItemPointer htids;
488 : Size mergedtupsz;
489 :
1138 pg 490 GIC 1749807 : Assert(!BTreeTupleIsPivot(itup));
1138 pg 491 ECB :
1138 pg 492 GIC 1749807 : if (!BTreeTupleIsPosting(itup))
1138 pg 493 ECB : {
1138 pg 494 GIC 1739831 : nhtids = 1;
1138 pg 495 CBC 1739831 : htids = &itup->t_tid;
1138 pg 496 ECB : }
497 : else
498 : {
1138 pg 499 GIC 9976 : nhtids = BTreeTupleGetNPosting(itup);
1138 pg 500 CBC 9976 : htids = BTreeTupleGetPosting(itup);
1138 pg 501 ECB : }
502 :
503 : /*
504 : * Don't append (have caller finish pending posting list as-is) if
505 : * appending heap TID(s) from itup would put us over maxpostingsize limit.
506 : *
507 : * This calculation needs to match the code used within _bt_form_posting()
508 : * for new posting list tuples.
509 : */
1138 pg 510 GIC 1749807 : mergedtupsz = MAXALIGN(state->basetupsize +
1138 pg 511 ECB : (state->nhtids + nhtids) * sizeof(ItemPointerData));
512 :
1138 pg 513 GIC 1749807 : if (mergedtupsz > state->maxpostingsize)
1024 pg 514 ECB : {
515 : /*
516 : * Count this as an oversized item for single value strategy, though
517 : * only when there are more than 50 TIDs in the final posting list tuple. This
518 : * limit (which is fairly arbitrary) avoids confusion about how many
519 : * 1/6 of a page tuples have been encountered/created by the current
520 : * deduplication pass.
521 : *
522 : * Note: We deliberately don't consider which deduplication pass
523 : * merged together tuples to create this item (could be a previous
524 : * deduplication pass, or current pass). See _bt_do_singleval()
525 : * comments.
526 : */
1024 pg 527 GIC 14378 : if (state->nhtids > 50)
1024 pg 528 CBC 13882 : state->nmaxitems++;
1024 pg 529 ECB :
1138 pg 530 GIC 14378 : return false;
1024 pg 531 ECB : }
532 :
533 : /*
534 : * Save heap TIDs to pending posting list tuple -- itup can be merged into
535 : * pending posting list
536 : */
1138 pg 537 GIC 1735429 : state->nitems++;
1138 pg 538 CBC 1735429 : memcpy(state->htids + state->nhtids, htids,
1138 pg 539 ECB : sizeof(ItemPointerData) * nhtids);
1138 pg 540 GIC 1735429 : state->nhtids += nhtids;
1138 pg 541 CBC 1735429 : state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
1138 pg 542 ECB :
1138 pg 543 GIC 1735429 : return true;
1138 pg 544 ECB : }
545 :
546 : /*
547 : * Finalize pending posting list tuple, and add it to the page. Final tuple
548 : * is based on saved base tuple, and saved list of heap TIDs.
549 : *
550 : * Returns space saving from deduplicating to make a new posting list tuple.
551 : * Note that this includes line pointer overhead. This is zero in the case
552 : * where no deduplication was possible.
553 : */
554 : Size
1138 pg 555 GIC 6000318 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
1138 pg 556 ECB : {
557 : OffsetNumber tupoff;
558 : Size tuplesz;
559 : Size spacesaving;
560 :
1138 pg 561 GIC 6000318 : Assert(state->nitems > 0);
1138 pg 562 CBC 6000318 : Assert(state->nitems <= state->nhtids);
563 6000318 : Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
1138 pg 564 ECB :
1138 pg 565 GIC 6000318 : tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
1138 pg 566 CBC 6000318 : if (state->nitems == 1)
1138 pg 567 ECB : {
568 : /* Use original, unchanged base tuple */
1138 pg 569 GIC 5657980 : tuplesz = IndexTupleSize(state->base);
247 pg 570 GNC 5657980 : Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
571 5657980 : Assert(tuplesz <= BTMaxItemSize(newpage));
1138 pg 572 CBC 5657980 : if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
1138 pg 573 ECB : false, false) == InvalidOffsetNumber)
1138 pg 574 LBC 0 : elog(ERROR, "deduplication failed to add tuple to page");
1138 pg 575 ECB :
1138 pg 576 GIC 5657980 : spacesaving = 0;
1138 pg 577 EUB : }
578 : else
1138 pg 579 ECB : {
580 : IndexTuple final;
581 :
582 : /* Form a tuple with a posting list */
1138 pg 583 GIC 342338 : final = _bt_form_posting(state->base, state->htids, state->nhtids);
584 342338 : tuplesz = IndexTupleSize(final);
585 342338 : Assert(tuplesz <= state->maxpostingsize);
1138 pg 586 ECB :
587 : /* Save final number of items for posting list */
1138 pg 588 CBC 342338 : state->intervals[state->nintervals].nitems = state->nitems;
589 :
1138 pg 590 GIC 342338 : Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
247 pg 591 GNC 342338 : Assert(tuplesz <= BTMaxItemSize(newpage));
1138 pg 592 CBC 342338 : if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
593 : false) == InvalidOffsetNumber)
1138 pg 594 LBC 0 : elog(ERROR, "deduplication failed to add tuple to page");
1138 pg 595 ECB :
1138 pg 596 CBC 342338 : pfree(final);
1138 pg 597 GIC 342338 : spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
1138 pg 598 EUB : /* Increment nintervals, since we wrote a new posting list tuple */
1138 pg 599 GIC 342338 : state->nintervals++;
1138 pg 600 CBC 342338 : Assert(spacesaving > 0 && spacesaving < BLCKSZ);
1138 pg 601 ECB : }
602 :
603 : /* Reset state for next pending posting list */
1138 pg 604 CBC 6000318 : state->nhtids = 0;
1138 pg 605 GIC 6000318 : state->nitems = 0;
606 6000318 : state->phystupsize = 0;
607 :
1138 pg 608 CBC 6000318 : return spacesaving;
1138 pg 609 ECB : }
610 :
611 : /*
816 612 : * Finalize interval during bottom-up index deletion.
613 : *
614 : * During a bottom-up pass we expect that TIDs will be recorded in dedup state
615 : * first, and then get moved over to delstate (in variable-sized batches) by
616 : * calling here. Call here happens when the number of TIDs in a dedup
617 : * interval is known, and interval gets finalized (i.e. when caller sees next
618 : * tuple on the page is not a duplicate, or when caller runs out of tuples to
619 : * process from leaf page).
620 : *
621 : * This is where bottom-up deletion determines and remembers which entries are
622 : * duplicates. This will be important information to the tableam delete
623 : * infrastructure later on. Plain index tuple duplicates are marked
624 : * "promising" here, per tableam contract.
625 : *
626 : * Our approach to marking entries whose TIDs come from posting lists is more
627 : * complicated. Posting lists can only be formed by a deduplication pass (or
628 : * during an index build), so recent version churn affecting the pointed-to
629 : * logical rows is not particularly likely. We may still give a weak signal
630 : * about posting list tuples' entries (by marking just one of its TIDs/entries
631 : * promising), though this is only a possibility in the event of further
632 : * duplicate index tuples in final interval that covers posting list tuple (as
633 : * in the plain tuple case). A weak signal/hint will be useful to the tableam
634 : * when it has no stronger signal to go with for the deletion operation as a
635 : * whole.
636 : *
637 : * The heuristics we use work well in practice because we only need to give
638 : * the tableam the right _general_ idea about where to look. Garbage tends to
639 : * naturally get concentrated in relatively few table blocks with workloads
640 : * that bottom-up deletion targets. The tableam cannot possibly rank all
641 : * available table blocks sensibly based on the hints we provide, but that's
642 : * okay -- only the extremes matter. The tableam just needs to be able to
643 : * predict which few table blocks will have the most tuples that are safe to
644 : * delete for each deletion operation, with low variance across related
645 : * deletion operations.
646 : */
647 : static void
816 pg 648 GIC 872320 : _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
649 : TM_IndexDeleteOp *delstate)
650 : {
651 872320 : bool dupinterval = (state->nitems > 1);
816 pg 652 ECB :
816 pg 653 GIC 872320 : Assert(state->nitems > 0);
654 872320 : Assert(state->nitems <= state->nhtids);
816 pg 655 CBC 872320 : Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
656 :
657 1899231 : for (int i = 0; i < state->nitems; i++)
816 pg 658 ECB : {
816 pg 659 CBC 1026911 : OffsetNumber offnum = state->baseoff + i;
816 pg 660 GIC 1026911 : ItemId itemid = PageGetItemId(page, offnum);
816 pg 661 CBC 1026911 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
816 pg 662 GIC 1026911 : TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
816 pg 663 CBC 1026911 : TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
816 pg 664 ECB :
816 pg 665 CBC 1026911 : if (!BTreeTupleIsPosting(itup))
816 pg 666 ECB : {
667 : /* Simple case: A plain non-pivot tuple */
816 pg 668 GIC 858455 : ideltid->tid = itup->t_tid;
816 pg 669 CBC 858455 : ideltid->id = delstate->ndeltids;
816 pg 670 GIC 858455 : istatus->idxoffnum = offnum;
671 858455 : istatus->knowndeletable = false; /* for now */
816 pg 672 CBC 858455 : istatus->promising = dupinterval; /* simple rule */
673 858455 : istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
816 pg 674 ECB :
816 pg 675 CBC 858455 : delstate->ndeltids++;
816 pg 676 ECB : }
677 : else
678 : {
679 : /*
680 : * Complicated case: A posting list tuple.
681 : *
682 : * We make the conservative assumption that there can only be at
683 : * most one affected logical row per posting list tuple. There
684 : * will be at most one promising entry in deltids to represent
685 : * this presumed lone logical row. Note that this isn't even
686 : * considered unless the posting list tuple is also in an interval
687 : * of duplicates -- this complicated rule is just a variant of the
688 : * simple rule used to decide if plain index tuples are promising.
689 : */
816 pg 690 GIC 168456 : int nitem = BTreeTupleGetNPosting(itup);
691 168456 : bool firstpromising = false;
692 168456 : bool lastpromising = false;
693 :
816 pg 694 CBC 168456 : Assert(_bt_posting_valid(itup));
816 pg 695 ECB :
816 pg 696 CBC 168456 : if (dupinterval)
697 : {
816 pg 698 ECB : /*
699 : * Complicated rule: either the first or last TID in the
700 : * posting list gets marked promising (if any at all)
701 : */
702 : BlockNumber minblocklist,
703 : midblocklist,
704 : maxblocklist;
705 : ItemPointer mintid,
706 : midtid,
707 : maxtid;
708 :
816 pg 709 GIC 10036 : mintid = BTreeTupleGetHeapTID(itup);
710 10036 : midtid = BTreeTupleGetPostingN(itup, nitem / 2);
711 10036 : maxtid = BTreeTupleGetMaxHeapTID(itup);
712 10036 : minblocklist = ItemPointerGetBlockNumber(mintid);
816 pg 713 CBC 10036 : midblocklist = ItemPointerGetBlockNumber(midtid);
714 10036 : maxblocklist = ItemPointerGetBlockNumber(maxtid);
816 pg 715 ECB :
716 : /* Only entry with predominant table block can be promising */
816 pg 717 CBC 10036 : firstpromising = (minblocklist == midblocklist);
718 10036 : lastpromising = (!firstpromising &&
719 : midblocklist == maxblocklist);
720 : }
816 pg 721 ECB :
816 pg 722 CBC 791085 : for (int p = 0; p < nitem; p++)
723 : {
816 pg 724 GIC 622629 : ItemPointer htid = BTreeTupleGetPostingN(itup, p);
725 :
816 pg 726 CBC 622629 : ideltid->tid = *htid;
816 pg 727 GIC 622629 : ideltid->id = delstate->ndeltids;
816 pg 728 CBC 622629 : istatus->idxoffnum = offnum;
816 pg 729 GIC 622629 : istatus->knowndeletable = false; /* for now */
816 pg 730 CBC 622629 : istatus->promising = false;
731 622629 : if ((firstpromising && p == 0) ||
732 68395 : (lastpromising && p == nitem - 1))
733 6802 : istatus->promising = true;
734 622629 : istatus->freespace = sizeof(ItemPointerData); /* at worst */
816 pg 735 ECB :
816 pg 736 CBC 622629 : ideltid++;
737 622629 : istatus++;
738 622629 : delstate->ndeltids++;
739 : }
816 pg 740 ECB : }
741 : }
742 :
816 pg 743 GIC 872320 : if (dupinterval)
744 : {
745 95295 : state->intervals[state->nintervals].nitems = state->nitems;
746 95295 : state->nintervals++;
816 pg 747 ECB : }
748 :
749 : /* Reset state for next interval */
816 pg 750 CBC 872320 : state->nhtids = 0;
816 pg 751 GIC 872320 : state->nitems = 0;
752 872320 : state->phystupsize = 0;
753 872320 : }
816 pg 754 ECB :
1138 755 : /*
756 : * Determine if page non-pivot tuples (data items) are all duplicates of the
757 : * same value -- if they are, deduplication's "single value" strategy should
758 : * be applied. The general goal of this strategy is to ensure that
759 : * nbtsplitloc.c (which uses its own single value strategy) will find a useful
760 : * split point as further duplicates are inserted, and successive rightmost
761 : * page splits occur among pages that store the same duplicate value. When
762 : * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
763 : * just like it would if deduplication were disabled.
764 : *
765 : * We expect that affected workloads will require _several_ single value
766 : * strategy deduplication passes (over a page that only stores duplicates)
767 : * before the page is finally split. The first deduplication pass should only
768 : * find regular non-pivot tuples. Later deduplication passes will find
769 : * existing maxpostingsize-capped posting list tuples, which must be skipped
770 : * over. The penultimate pass is generally the first pass that actually
771 : * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
772 : * few untouched non-pivot tuples. The final deduplication pass won't free
773 : * any space -- it will skip over everything without merging anything (it
774 : * retraces the steps of the penultimate pass).
775 : *
776 : * Fortunately, having several passes isn't too expensive. Each pass (after
777 : * the first pass) won't spend many cycles on the large posting list tuples
778 : * left by previous passes. Each pass will find a large contiguous group of
779 : * smaller duplicate tuples to merge together at the end of the page.
780 : */
781 : static bool
1138 pg 782 GIC 25291 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
783 : OffsetNumber minoff, IndexTuple newitem)
784 : {
1107 785 25291 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1138 pg 786 ECB : ItemId itemid;
787 : IndexTuple itup;
788 :
1138 pg 789 CBC 25291 : itemid = PageGetItemId(page, minoff);
1138 pg 790 GIC 25291 : itup = (IndexTuple) PageGetItem(page, itemid);
791 :
1107 792 25291 : if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
1138 pg 793 ECB : {
1138 pg 794 CBC 926 : itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
1138 pg 795 GIC 926 : itup = (IndexTuple) PageGetItem(page, itemid);
1138 pg 796 ECB :
1107 pg 797 GIC 926 : if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
1138 pg 798 CBC 568 : return true;
1138 pg 799 ECB : }
800 :
1138 pg 801 CBC 24723 : return false;
1138 pg 802 ECB : }
803 :
804 : /*
805 : * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
806 : * and final maxpostingsize-capped tuple. The sixth and final posting list
807 : * tuple will end up somewhat smaller than the first five. (Note: The first
808 : * five tuples could actually just be very large duplicate tuples that
809 : * couldn't be merged together at all. Deduplication will simply not modify
810 : * the page when that happens.)
811 : *
812 : * When there are six posting lists on the page (after current deduplication
813 : * pass goes on to create/observe a sixth very large tuple), caller should end
814 : * its deduplication pass. It isn't useful to try to deduplicate items that
815 : * are supposed to end up on the new right sibling page following the
816 : * anticipated page split. A future deduplication pass of future right
817 : * anticipated page split. A future deduplication pass of the future right
818 : * strategy deduplication pass for a given leaf page will generally find only
819 : * plain non-pivot tuples -- see _bt_do_singleval() comments.)
820 : */
821 : static void
1138 pg 822 GIC 318 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
823 : {
824 : Size leftfree;
825 : int reduction;
1138 pg 826 ECB :
827 : /* This calculation needs to match nbtsplitloc.c */
1138 pg 828 GIC 318 : leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
829 : MAXALIGN(sizeof(BTPageOpaqueData));
830 : /* Subtract size of new high key (includes pivot heap TID space) */
831 318 : leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
1138 pg 832 ECB :
833 : /*
834 : * Reduce maxpostingsize by an amount equal to target free space on left
835 : * half of page
836 : */
1138 pg 837 GIC 318 : reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
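	/*
	 * With BTREE_SINGLEVAL_FILLFACTOR at 96, this reserves roughly 4% of
	 * leftfree -- a bit over 300 bytes on a default 8 kB page when newitem
	 * is small.
	 */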
838 318 : if (state->maxpostingsize > reduction)
839 318 : state->maxpostingsize -= reduction;
840 : else
1138 pg 841 LBC 0 : state->maxpostingsize = 0;
1138 pg 842 CBC 318 : }
1138 pg 843 ECB :
844 : /*
1138 pg 845 EUB : * Build a posting list tuple based on caller's "base" index tuple and list of
1138 pg 846 ECB : * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
847 : * posting list. (Posting list tuples can never have a single heap TID, partly
848 : * because that ensures that deduplication always reduces final MAXALIGN()'d
849 : * size of entire tuple.)
850 : *
851 : * Convention is that posting list starts at a MAXALIGN()'d offset (rather
852 : * than a SHORTALIGN()'d offset), in line with the approach taken when
853 : * appending a heap TID to new pivot tuple/high key during suffix truncation.
854 : * This sometimes wastes a little space that was only needed as alignment
855 : * padding in the original tuple. Following this convention simplifies the
856 : * space accounting used when deduplicating a page (the same convention
857 : * simplifies the accounting for choosing a point to split a page at).
858 : *
859 : * Note: Caller's "htids" array must be unique and already in ascending TID
860 : * order. Any existing heap TIDs from "base" won't automatically appear in
861 : * returned posting list tuple (they must be included in htids array.)
862 : */
863 : IndexTuple
1138 pg 864 GIC 381463 : _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
865 : {
866 : uint32 keysize,
867 : newsize;
1138 pg 868 ECB : IndexTuple itup;
869 :
1138 pg 870 GIC 381463 : if (BTreeTupleIsPosting(base))
871 69051 : keysize = BTreeTupleGetPostingOffset(base);
872 : else
873 312412 : keysize = IndexTupleSize(base);
1138 pg 874 ECB :
1138 pg 875 CBC 381463 : Assert(!BTreeTupleIsPivot(base));
1138 pg 876 GIC 381463 : Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
1138 pg 877 CBC 381463 : Assert(keysize == MAXALIGN(keysize));
878 :
1138 pg 879 ECB : /* Determine final size of new tuple */
1138 pg 880 CBC 381463 : if (nhtids > 1)
881 372094 : newsize = MAXALIGN(keysize +
882 : nhtids * sizeof(ItemPointerData));
883 : else
884 9369 : newsize = keysize;
1138 pg 885 ECB :
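	/*
	 * For example, assuming 8 byte MAXALIGN: a 16 byte key plus three 6 byte
	 * heap TIDs gives newsize = MAXALIGN(16 + 18) = 40 bytes.
	 */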
1138 pg 886 GIC 381463 : Assert(newsize <= INDEX_SIZE_MASK);
887 381463 : Assert(newsize == MAXALIGN(newsize));
1138 pg 888 ECB :
889 : /* Allocate memory using palloc0() (matches index_form_tuple()) */
1138 pg 890 CBC 381463 : itup = palloc0(newsize);
891 381463 : memcpy(itup, base, keysize);
1138 pg 892 GIC 381463 : itup->t_info &= ~INDEX_SIZE_MASK;
893 381463 : itup->t_info |= newsize;
1138 pg 894 CBC 381463 : if (nhtids > 1)
1138 pg 895 ECB : {
896 : /* Form posting list tuple */
1138 pg 897 CBC 372094 : BTreeTupleSetPosting(itup, nhtids, keysize);
898 372094 : memcpy(BTreeTupleGetPosting(itup), htids,
899 : sizeof(ItemPointerData) * nhtids);
1138 pg 900 GIC 372094 : Assert(_bt_posting_valid(itup));
1138 pg 901 ECB : }
902 : else
903 : {
904 : /* Form standard non-pivot tuple */
1138 pg 905 GIC 9369 : itup->t_info &= ~INDEX_ALT_TID_MASK;
906 9369 : ItemPointerCopy(htids, &itup->t_tid);
907 9369 : Assert(ItemPointerIsValid(&itup->t_tid));
908 : }
1138 pg 909 ECB :
1138 pg 910 CBC 381463 : return itup;
1138 pg 911 ECB : }
912 :
913 : /*
914 : * Generate a replacement tuple by "updating" a posting list tuple so that it
915 : * no longer has TIDs that need to be deleted.
916 : *
917 : * Used by both VACUUM and index deletion. Caller's vacposting argument
918 : * points to the existing posting list tuple to be updated.
919 : *
920 : * On return, caller's vacposting argument will point to final "updated"
921 : * tuple, which will be palloc()'d in caller's memory context.
922 : */
923 : void
1138 pg 924 GIC 87661 : _bt_update_posting(BTVacuumPosting vacposting)
925 : {
926 87661 : IndexTuple origtuple = vacposting->itup;
927 : uint32 keysize,
1138 pg 928 ECB : newsize;
929 : IndexTuple itup;
930 : int nhtids;
931 : int ui,
932 : d;
933 : ItemPointer htids;
934 :
1138 pg 935 GIC 87661 : nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
936 :
937 87661 : Assert(_bt_posting_valid(origtuple));
938 87661 : Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
1138 pg 939 ECB :
940 : /*
941 : * Determine final size of new tuple.
942 : *
943 : * This calculation needs to match the code used within _bt_form_posting()
944 : * for new posting list tuples. We avoid calling _bt_form_posting() here
945 : * to save ourselves a second memory allocation for a htids workspace.
946 : */
1134 pg 947 GIC 87661 : keysize = BTreeTupleGetPostingOffset(origtuple);
1138 948 87661 : if (nhtids > 1)
949 6637 : newsize = MAXALIGN(keysize +
950 : nhtids * sizeof(ItemPointerData));
1138 pg 951 ECB : else
1138 pg 952 CBC 81024 : newsize = keysize;
1138 pg 953 ECB :
1133 pg 954 GIC 87661 : Assert(newsize <= INDEX_SIZE_MASK);
955 87661 : Assert(newsize == MAXALIGN(newsize));
1133 pg 956 ECB :
957 : /* Allocate memory using palloc0() (matches index_form_tuple()) */
1138 pg 958 CBC 87661 : itup = palloc0(newsize);
959 87661 : memcpy(itup, origtuple, keysize);
1138 pg 960 GIC 87661 : itup->t_info &= ~INDEX_SIZE_MASK;
961 87661 : itup->t_info |= newsize;
1138 pg 962 ECB :
1138 pg 963 CBC 87661 : if (nhtids > 1)
1138 pg 964 ECB : {
965 : /* Form posting list tuple */
1138 pg 966 GIC 6637 : BTreeTupleSetPosting(itup, nhtids, keysize);
1138 pg 967 CBC 6637 : htids = BTreeTupleGetPosting(itup);
968 : }
969 : else
1138 pg 970 ECB : {
971 : /* Form standard non-pivot tuple */
1138 pg 972 GIC 81024 : itup->t_info &= ~INDEX_ALT_TID_MASK;
973 81024 : htids = &itup->t_tid;
974 : }
975 :
1138 pg 976 CBC 87661 : ui = 0;
977 87661 : d = 0;
1138 pg 978 GIC 372085 : for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
979 : {
1138 pg 980 CBC 284424 : if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
1138 pg 981 ECB : {
1138 pg 982 CBC 120806 : d++;
1138 pg 983 GIC 120806 : continue;
1138 pg 984 ECB : }
1138 pg 985 GIC 163618 : htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
1138 pg 986 ECB : }
1138 pg 987 CBC 87661 : Assert(ui == nhtids);
1138 pg 988 GIC 87661 : Assert(d == vacposting->ndeletedtids);
1138 pg 989 CBC 87661 : Assert(nhtids == 1 || _bt_posting_valid(itup));
1133 pg 990 GIC 87661 : Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
1138 pg 991 ECB :
992 : /* vacposting arg's itup will now point to updated version */
1138 pg 993 CBC 87661 : vacposting->itup = itup;
994 87661 : }
995 :
996 : /*
1138 pg 997 ECB : * Prepare for a posting list split by swapping heap TID in newitem with heap
998 : * TID from original posting list (the 'oposting' heap TID located at offset
999 : * 'postingoff'). Modifies newitem, so caller should pass their own private
1000 : * copy that can safely be modified.
1001 : *
1002 : * Returns new posting list tuple, which is palloc()'d in caller's context.
1003 : * This is guaranteed to be the same size as 'oposting'. Modified newitem is
1004 : * what caller actually inserts. (This happens inside the same critical
1005 : * section that performs an in-place update of old posting list using new
1006 : * posting list returned here.)
1007 : *
1008 : * While the keys from newitem and oposting must be opclass equal, and must
1009 : * generate identical output when run through the underlying type's output
1010 : * function, it doesn't follow that their representations match exactly.
1011 : * Caller must avoid assuming that there can't be representational differences
1012 : * that make datums from oposting bigger or smaller than the corresponding
1013 : * datums from newitem. For example, differences in TOAST input state might
1014 : * break a faulty assumption about tuple size (the executor is entitled to
1015 : * apply TOAST compression based on its own criteria). It also seems possible
1016 : * that further representational variation will be introduced in the future,
1017 : * in order to support nbtree features like page-level prefix compression.
1018 : *
1019 : * See nbtree/README for details on the design of posting list splits.
1020 : */
1021 : IndexTuple
1138 pg 1022 GIC 9794 : _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
1023 : {
1024 : int nhtids;
1025 : char *replacepos;
1138 pg 1026 ECB : char *replaceposright;
1027 : Size nmovebytes;
1028 : IndexTuple nposting;
1029 :
1138 pg 1030 GIC 9794 : nhtids = BTreeTupleGetNPosting(oposting);
1031 9794 : Assert(_bt_posting_valid(oposting));
1032 :
1033 : /*
695 pg 1034 ECB : * The postingoff argument originated as a _bt_binsrch_posting() return
1035 : * value. It will be 0 in the event of corruption that makes a leaf page
1036 : * contain a non-pivot tuple that's somehow identical to newitem (no two
1037 : * non-pivot tuples should ever have the same TID). This has been known
1038 : * to happen in the field from time to time.
1039 : *
1040 : * Perform a basic sanity check to catch this case now.
1041 : */
695 pg 1042 GIC 9794 : if (!(postingoff > 0 && postingoff < nhtids))
695 pg 1043 UIC 0 : elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
1044 : nhtids, postingoff);
1045 :
1138 pg 1046 ECB : /*
1138 pg 1047 EUB : * Move item pointers in posting list to make a gap for the new item's
1048 : * heap TID. We shift TIDs one place to the right, losing original
1049 : * rightmost TID. (nmovebytes must not include TIDs to the left of
1050 : * postingoff, nor the existing rightmost/max TID that gets overwritten.)
1051 : */
1138 pg 1052 GIC 9794 : nposting = CopyIndexTuple(oposting);
1053 9794 : replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
1054 9794 : replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
1055 9794 : nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
1138 pg 1056 CBC 9794 : memmove(replaceposright, replacepos, nmovebytes);
1138 pg 1057 ECB :
1058 : /* Fill the gap at postingoff with TID of new item (original new TID) */
1138 pg 1059 CBC 9794 : Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
1060 9794 : ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
1061 :
1062 : /* Now copy oposting's rightmost/max TID into new item (final new TID) */
1063 9794 : ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
1138 pg 1064 ECB :
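	/*
	 * For example, if oposting held heap TIDs (0,1) (0,3) (0,5) and newitem
	 * arrived with TID (0,4), postingoff would be 2: nposting ends up with
	 * (0,1) (0,3) (0,4), while newitem leaves here carrying (0,5).
	 */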
1138 pg 1065 GIC 9794 : Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
1066 : BTreeTupleGetHeapTID(newitem)) < 0);
1138 pg 1067 CBC 9794 : Assert(_bt_posting_valid(nposting));
1068 :
1069 9794 : return nposting;
1070 : }
1138 pg 1071 ECB :
1072 : /*
1073 : * Verify posting list invariants for "posting", which must be a posting list
1074 : * tuple. Used within assertions.
1075 : */
1076 : #ifdef USE_ASSERT_CHECKING
1077 : static bool
1138 pg 1078 GIC 654436 : _bt_posting_valid(IndexTuple posting)
1079 : {
1080 : ItemPointerData last;
1081 : ItemPointer htid;
1138 pg 1082 ECB :
1138 pg 1083 GIC 654436 : if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
1138 pg 1084 UIC 0 : return false;
1085 :
1086 : /* Remember first heap TID for loop */
1138 pg 1087 CBC 654436 : ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
1138 pg 1088 GBC 654436 : if (!ItemPointerIsValid(&last))
1138 pg 1089 UIC 0 : return false;
1090 :
1138 pg 1091 ECB : /* Iterate, starting from second TID */
1138 pg 1092 CBC 7657174 : for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
1138 pg 1093 EUB : {
1138 pg 1094 GIC 7002738 : htid = BTreeTupleGetPostingN(posting, i);
1095 :
1138 pg 1096 CBC 7002738 : if (!ItemPointerIsValid(htid))
1138 pg 1097 UIC 0 : return false;
1138 pg 1098 CBC 7002738 : if (ItemPointerCompare(htid, &last) <= 0)
1138 pg 1099 UIC 0 : return false;
1138 pg 1100 CBC 7002738 : ItemPointerCopy(htid, &last);
1138 pg 1101 EUB : }
1138 pg 1102 ECB :
1138 pg 1103 GBC 654436 : return true;
1138 pg 1104 ECB : }
1105 : #endif