LCOV - differential code coverage report
Current view:  top level - src/backend/access/nbtree - nbtdedup.c (source / functions)
Current:       Differential Code Coverage HEAD vs 15
Current Date:  2023-04-08 15:15:32
Baseline:      15
Baseline Date: 2023-04-08 15:09:40

             Coverage  Total  Hit  LBC  UIC  UBC  GBC  GIC  GNC  CBC  EUB  ECB
Lines:         96.8 %    346  335    3    7    1    2  155    3  175    8  157
Functions:    100.0 %     12   12                       11    1             11

Legend: Lines: hit / not hit
        TLA categories (genhtml differential coverage): CBC = Covered Baseline Code,
        GBC = Gained Baseline Coverage, GIC = Gained Included Coverage,
        GNC = Gained New Coverage, LBC = Lost Baseline Coverage,
        UBC = Uncovered Baseline Code, UIC = Uncovered Included Code,
        ECB = Excluded Covered Baseline, EUB = Excluded Uncovered Baseline

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * nbtdedup.c
       4                 :  *    Deduplicate or bottom-up delete items in Postgres btrees.
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/access/nbtree/nbtdedup.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : #include "postgres.h"
      16                 : 
      17                 : #include "access/nbtree.h"
      18                 : #include "access/nbtxlog.h"
      19                 : #include "access/xloginsert.h"
      20                 : #include "miscadmin.h"
      21                 : #include "utils/rel.h"
      22                 : 
      23                 : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
      24                 :                                            TM_IndexDeleteOp *delstate);
      25                 : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
      26                 :                              OffsetNumber minoff, IndexTuple newitem);
      27                 : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
      28                 :                                      Size newitemsz);
      29                 : #ifdef USE_ASSERT_CHECKING
      30                 : static bool _bt_posting_valid(IndexTuple posting);
      31                 : #endif
      32                 : 
      33                 : /*
      34                 :  * Perform a deduplication pass.
      35                 :  *
      36                 :  * The general approach taken here is to perform as much deduplication as
      37                 :  * possible to free as much space as possible.  Note, however, that "single
      38                 :  * value" strategy is used for !bottomupdedup callers when the page is full of
      39                 :  * tuples of a single value.  Deduplication passes that apply the strategy
      40                 :  * will leave behind a few untouched tuples at the end of the page, preparing
      41                 :  * the page for an anticipated page split that uses nbtsplitloc.c's own single
      42                 :  * value strategy.  Our high level goal is to delay merging the untouched
      43                 :  * tuples until after the page splits.
      44                 :  *
      45                 :  * When a call to _bt_bottomupdel_pass() just took place (and failed), our
      46                 :  * high level goal is to prevent a page split entirely by buying more time.
      47                 :  * We still hope that a page split can be avoided altogether.  That's why
      48                 :  * single value strategy is not even considered for bottomupdedup callers.
      49                 :  *
      50                 :  * The page will have to be split if we cannot successfully free at least
      51                 :  * newitemsz (we also need space for newitem's line pointer, which isn't
      52                 :  * included in caller's newitemsz).
      53                 :  *
      54                 :  * Note: Caller should have already deleted all existing items with their
      55                 :  * LP_DEAD bits set.
      56                 :  */
      57                 : void
      58 CBC       28659 : _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
      59                 :                Size newitemsz, bool bottomupdedup)
      60                 : {
      61                 :     OffsetNumber offnum,
      62                 :                 minoff,
      63                 :                 maxoff;
      64           28659 :     Page        page = BufferGetPage(buf);
      65           28659 :     BTPageOpaque opaque = BTPageGetOpaque(page);
      66                 :     Page        newpage;
      67                 :     BTDedupState state;
      68           28659 :     Size        pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
      69           28659 :     bool        singlevalstrat = false;
      70           28659 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
      71                 : 
      72                 :     /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
      73           28659 :     newitemsz += sizeof(ItemIdData);
      74                 : 
      75                 :     /*
      76                 :      * Initialize deduplication state.
      77                 :      *
      78                 :      * It would be possible for maxpostingsize (limit on posting list tuple
      79                 :      * size) to be set to one third of the page.  However, it seems like a
      80                 :      * good idea to limit the size of posting lists to one sixth of a page.
      81                 :      * That ought to leave us with a good split point when pages full of
      82                 :      * duplicates can be split several times.
      83                 :      */
      84           28659 :     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
      85           28659 :     state->deduplicate = true;
      86           28659 :     state->nmaxitems = 0;
      87           28659 :     state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
      88                 :     /* Metadata about base tuple of current pending posting list */
      89           28659 :     state->base = NULL;
      90           28659 :     state->baseoff = InvalidOffsetNumber;
      91           28659 :     state->basetupsize = 0;
      92                 :     /* Metadata about current pending posting list TIDs */
      93           28659 :     state->htids = palloc(state->maxpostingsize);
      94           28659 :     state->nhtids = 0;
      95           28659 :     state->nitems = 0;
      96                 :     /* Size of all physical tuples to be replaced by pending posting list */
      97           28659 :     state->phystupsize = 0;
      98                 :     /* nintervals should be initialized to zero */
      99           28659 :     state->nintervals = 0;
     100                 : 
     101           28659 :     minoff = P_FIRSTDATAKEY(opaque);
     102           28659 :     maxoff = PageGetMaxOffsetNumber(page);
     103                 : 
     104                 :     /*
     105                 :      * Consider applying "single value" strategy, though only if the page
     106                 :      * seems likely to be split in the near future
     107                 :      */
     108           28659 :     if (!bottomupdedup)
     109           25291 :         singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
     110                 : 
     111                 :     /*
     112                 :      * Deduplicate items from page, and write them to newpage.
     113                 :      *
     114                 :      * Copy the original page's LSN into newpage copy.  This will become the
     115                 :      * updated version of the page.  We need this because XLogInsert will
     116                 :      * examine the LSN and possibly dump it in a page image.
     117                 :      */
     118           28659 :     newpage = PageGetTempPageCopySpecial(page);
     119           28659 :     PageSetLSN(newpage, PageGetLSN(page));
     120                 : 
     121                 :     /* Copy high key, if any */
     122           28659 :     if (!P_RIGHTMOST(opaque))
     123                 :     {
     124           21027 :         ItemId      hitemid = PageGetItemId(page, P_HIKEY);
     125           21027 :         Size        hitemsz = ItemIdGetLength(hitemid);
     126           21027 :         IndexTuple  hitem = (IndexTuple) PageGetItem(page, hitemid);
     127                 : 
     128           21027 :         if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
     129                 :                         false, false) == InvalidOffsetNumber)
     130 UBC           0 :             elog(ERROR, "deduplication failed to add highkey");
     131                 :     }
     132                 : 
     133 CBC       28659 :     for (offnum = minoff;
     134         6486288 :          offnum <= maxoff;
     135         6457629 :          offnum = OffsetNumberNext(offnum))
     136                 :     {
     137         6457629 :         ItemId      itemid = PageGetItemId(page, offnum);
     138         6457629 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
     139                 : 
     140         6457629 :         Assert(!ItemIdIsDead(itemid));
     141                 : 
     142         6457629 :         if (offnum == minoff)
     143                 :         {
     144                 :             /*
     145                 :              * No previous/base tuple for the data item -- use the data item
     146                 :              * as base tuple of pending posting list
     147                 :              */
     148           28659 :             _bt_dedup_start_pending(state, itup, offnum);
     149                 :         }
     150        12856912 :         else if (state->deduplicate &&
     151         7275837 :                  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
     152          847895 :                  _bt_dedup_save_htid(state, itup))
     153                 :         {
     154                 :             /*
     155                 :              * Tuple is equal to base tuple of pending posting list.  Heap
     156                 :              * TID(s) for itup have been saved in state.
     157                 :              */
     158                 :         }
     159                 :         else
     160                 :         {
     161                 :             /*
     162                 :              * Tuple is not equal to pending posting list tuple, or
     163                 :              * _bt_dedup_save_htid() opted to not merge current item into
     164                 :              * pending posting list for some other reason (e.g., adding more
     165                 :              * TIDs would have caused posting list to exceed current
     166                 :              * maxpostingsize).
     167                 :              *
     168                 :              * If state contains pending posting list with more than one item,
     169                 :              * form new posting tuple and add it to our temp page (newpage).
     170                 :              * Else add pending interval's base tuple to the temp page as-is.
     171                 :              */
     172         5591710 :             pagesaving += _bt_dedup_finish_pending(newpage, state);
     173                 : 
     174         5591710 :             if (singlevalstrat)
     175                 :             {
     176                 :                 /*
     177                 :                  * Single value strategy's extra steps.
     178                 :                  *
     179                 :                  * Lower maxpostingsize for sixth and final large posting list
     180                 :                  * tuple at the point where 5 maxpostingsize-capped tuples
     181                 :                  * have either been formed or observed.
     182                 :                  *
     183                 :                  * When a sixth maxpostingsize-capped item is formed/observed,
     184                 :                  * stop merging together tuples altogether.  The few tuples
     185                 :                  * that remain at the end of the page won't be merged together
     186                 :                  * at all (at least not until after a future page split takes
     187                 :                  * place, when this page's newly allocated right sibling page
     188                 :                  * gets its first deduplication pass).
     189                 :                  */
     190 GIC        2380 :                 if (state->nmaxitems == 5)
     191 CBC         318 :                     _bt_singleval_fillfactor(page, state, newitemsz);
     192            2062 :                 else if (state->nmaxitems == 6)
     193 ECB             :                 {
     194 GIC         123 :                     state->deduplicate = false;
     195 CBC         123 :                     singlevalstrat = false; /* won't be back here */
     196 ECB             :                 }
     197                 :             }
     198                 : 
     199                 :             /* itup starts new pending posting list */
     200 GIC     5591710 :             _bt_dedup_start_pending(state, itup, offnum);
     201 ECB             :         }
     202                 :     }
     203                 : 
     204                 :     /* Handle the last item */
     205 GIC       28659 :     pagesaving += _bt_dedup_finish_pending(newpage, state);
     206 ECB             : 
     207                 :     /*
     208                 :      * If no items suitable for deduplication were found, newpage must be
     209                 :      * exactly the same as the original page, so just return from function.
     210                 :      *
      211                 :      * We could determine whether or not to proceed on the basis of the space
     212                 :      * savings being sufficient to avoid an immediate page split instead.  We
     213                 :      * don't do that because there is some small value in nbtsplitloc.c always
     214                 :      * operating against a page that is fully deduplicated (apart from
     215                 :      * newitem).  Besides, most of the cost has already been paid.
     216                 :      */
     217 GIC       28659 :     if (state->nintervals == 0)
     218 ECB             :     {
     219                 :         /* cannot leak memory here */
     220 GIC        3729 :         pfree(newpage);
     221 CBC        3729 :         pfree(state->htids);
     222            3729 :         pfree(state);
     223            3729 :         return;
     224 ECB             :     }
     225                 : 
     226                 :     /*
     227                 :      * By here, it's clear that deduplication will definitely go ahead.
     228                 :      *
     229                 :      * Clear the BTP_HAS_GARBAGE page flag.  The index must be a heapkeyspace
     230                 :      * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
     231                 :      * But keep things tidy.
     232                 :      */
     233 GIC       24930 :     if (P_HAS_GARBAGE(opaque))
     234 ECB             :     {
     235 UIC           0 :         BTPageOpaque nopaque = BTPageGetOpaque(newpage);
     236 EUB             : 
     237 UIC           0 :         nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
     238 EUB             :     }
     239                 : 
     240 GIC       24930 :     START_CRIT_SECTION();
     241 ECB             : 
     242 GIC       24930 :     PageRestoreTempPage(newpage, page);
     243 CBC       24930 :     MarkBufferDirty(buf);
     244 ECB             : 
     245                 :     /* XLOG stuff */
     246 GIC       24930 :     if (RelationNeedsWAL(rel))
     247 ECB             :     {
     248                 :         XLogRecPtr  recptr;
     249                 :         xl_btree_dedup xlrec_dedup;
     250                 : 
     251 GIC       24867 :         xlrec_dedup.nintervals = state->nintervals;
     252 ECB             : 
     253 GIC       24867 :         XLogBeginInsert();
     254 CBC       24867 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
     255           24867 :         XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
     256 ECB             : 
     257                 :         /*
     258                 :          * The intervals array is not in the buffer, but pretend that it is.
     259                 :          * When XLogInsert stores the whole buffer, the array need not be
     260                 :          * stored too.
     261                 :          */
     262 GIC       24867 :         XLogRegisterBufData(0, (char *) state->intervals,
     263 CBC       24867 :                             state->nintervals * sizeof(BTDedupInterval));
     264 ECB             : 
     265 GIC       24867 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
     266 ECB             : 
     267 GIC       24867 :         PageSetLSN(page, recptr);
     268 ECB             :     }
     269                 : 
     270 GIC       24930 :     END_CRIT_SECTION();
     271 ECB             : 
     272                 :     /* Local space accounting should agree with page accounting */
     273 GIC       24930 :     Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
     274 ECB             : 
     275                 :     /* cannot leak memory here */
     276 GIC       24930 :     pfree(state->htids);
     277 CBC       24930 :     pfree(state);
     278 ECB             : }
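
/*
 * Illustrative sketch, not part of nbtdedup.c: roughly how the insert path
 * reaches _bt_dedup_pass().  Loosely based on _bt_delete_or_dedup_one_page()
 * in nbtinsert.c; the gating checks and the bottomupdedup argument are
 * simplified assumptions here.
 */
if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
{
    /* itemsz is MAXALIGN()'d and excludes the line pointer, per contract */
    _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
                   insertstate->itemsz, bottomupdedup);
}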
     279                 : 
     280                 : /*
     281                 :  * Perform bottom-up index deletion pass.
     282                 :  *
     283                 :  * See if duplicate index tuples (plus certain nearby tuples) are eligible to
     284                 :  * be deleted via bottom-up index deletion.  The high level goal here is to
     285                 :  * entirely prevent "unnecessary" page splits caused by MVCC version churn
     286                 :  * from UPDATEs (when the UPDATEs don't logically modify any of the columns
     287                 :  * covered by the 'rel' index).  This is qualitative, not quantitative -- we
     288                 :  * do not particularly care about once-off opportunities to delete many index
     289                 :  * tuples together.
     290                 :  *
     291                 :  * See nbtree/README for details on the design of nbtree bottom-up deletion.
     292                 :  * See access/tableam.h for a description of how we're expected to cooperate
     293                 :  * with the tableam.
     294                 :  *
     295                 :  * Returns true on success, in which case caller can assume page split will be
     296                 :  * avoided for a reasonable amount of time.  Returns false when caller should
     297                 :  * deduplicate the page (if possible at all).
     298                 :  *
     299                 :  * Note: Occasionally we return true despite failing to delete enough items to
     300                 :  * avoid a split.  This makes caller skip deduplication and go split the page
     301                 :  * right away.  Our return value is always just advisory information.
     302                 :  *
     303                 :  * Note: Caller should have already deleted all existing items with their
     304                 :  * LP_DEAD bits set.
     305                 :  */
     306                 : bool
     307 GIC        3512 : _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
     308 ECB             :                      Size newitemsz)
     309                 : {
     310                 :     OffsetNumber offnum,
     311                 :                 minoff,
     312                 :                 maxoff;
     313 GIC        3512 :     Page        page = BufferGetPage(buf);
     314 CBC        3512 :     BTPageOpaque opaque = BTPageGetOpaque(page);
     315 ECB             :     BTDedupState state;
     316                 :     TM_IndexDeleteOp delstate;
     317                 :     bool        neverdedup;
     318 GIC        3512 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     319 ECB             : 
     320                 :     /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
     321 GIC        3512 :     newitemsz += sizeof(ItemIdData);
     322 ECB             : 
     323                 :     /* Initialize deduplication state */
     324 GIC        3512 :     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
     325 CBC        3512 :     state->deduplicate = true;
     326            3512 :     state->nmaxitems = 0;
     327            3512 :     state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
     328            3512 :     state->base = NULL;
     329            3512 :     state->baseoff = InvalidOffsetNumber;
     330            3512 :     state->basetupsize = 0;
     331            3512 :     state->htids = palloc(state->maxpostingsize);
     332            3512 :     state->nhtids = 0;
     333            3512 :     state->nitems = 0;
     334            3512 :     state->phystupsize = 0;
     335            3512 :     state->nintervals = 0;
     336 ECB             : 
     337                 :     /*
     338                 :      * Initialize tableam state that describes bottom-up index deletion
     339                 :      * operation.
     340                 :      *
     341                 :      * We'll go on to ask the tableam to search for TIDs whose index tuples we
     342                 :      * can safely delete.  The tableam will search until our leaf page space
     343                 :      * target is satisfied, or until the cost of continuing with the tableam
     344                 :      * operation seems too high.  It focuses its efforts on TIDs associated
     345                 :      * with duplicate index tuples that we mark "promising".
     346                 :      *
     347                 :      * This space target is a little arbitrary.  The tableam must be able to
     348                 :      * keep the costs and benefits in balance.  We provide the tableam with
     349                 :      * exhaustive information about what might work, without directly
     350                 :      * concerning ourselves with avoiding work during the tableam call.  Our
     351                 :      * role in costing the bottom-up deletion process is strictly advisory.
     352                 :      */
     353 GIC        3512 :     delstate.irel = rel;
     354 CBC        3512 :     delstate.iblknum = BufferGetBlockNumber(buf);
     355            3512 :     delstate.bottomup = true;
     356            3512 :     delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
     357            3512 :     delstate.ndeltids = 0;
     358            3512 :     delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
     359            3512 :     delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
     360 ECB             : 
     361 GIC        3512 :     minoff = P_FIRSTDATAKEY(opaque);
     362 CBC        3512 :     maxoff = PageGetMaxOffsetNumber(page);
     363            3512 :     for (offnum = minoff;
     364         1030423 :          offnum <= maxoff;
     365         1026911 :          offnum = OffsetNumberNext(offnum))
     366 ECB             :     {
     367 GIC     1026911 :         ItemId      itemid = PageGetItemId(page, offnum);
     368 CBC     1026911 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
     369 ECB             : 
     370 GIC     1026911 :         Assert(!ItemIdIsDead(itemid));
     371 ECB             : 
     372 GIC     1026911 :         if (offnum == minoff)
     373 ECB             :         {
     374                 :             /* itup starts first pending interval */
     375 GIC        3512 :             _bt_dedup_start_pending(state, itup, offnum);
     376 ECB             :         }
     377 GIC     1177990 :         else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
     378 CBC      154591 :                  _bt_dedup_save_htid(state, itup))
     379 ECB             :         {
     380                 :             /* Tuple is equal; just added its TIDs to pending interval */
     381                 :         }
     382                 :         else
     383                 :         {
     384                 :             /* Finalize interval -- move its TIDs to delete state */
     385 GIC      868808 :             _bt_bottomupdel_finish_pending(page, state, &delstate);
     386 ECB             : 
     387                 :             /* itup starts new pending interval */
     388 GIC      868808 :             _bt_dedup_start_pending(state, itup, offnum);
     389 ECB             :         }
     390                 :     }
     391                 :     /* Finalize final interval -- move its TIDs to delete state */
     392 GIC        3512 :     _bt_bottomupdel_finish_pending(page, state, &delstate);
     393 ECB             : 
     394                 :     /*
     395                 :      * We don't give up now in the event of having few (or even zero)
     396                 :      * promising tuples for the tableam because it's not up to us as the index
     397                 :      * AM to manage costs (note that the tableam might have heuristics of its
     398                 :      * own that work out what to do).  We should at least avoid having our
     399                 :      * caller do a useless deduplication pass after we return in the event of
     400                 :      * zero promising tuples, though.
     401                 :      */
     402 GIC        3512 :     neverdedup = false;
     403 CBC        3512 :     if (state->nintervals == 0)
     404               5 :         neverdedup = true;
     405 ECB             : 
     406 GIC        3512 :     pfree(state->htids);
     407 CBC        3512 :     pfree(state);
     408 ECB             : 
     409                 :     /* Ask tableam which TIDs are deletable, then physically delete them */
     410 GIC        3512 :     _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
     411 ECB             : 
     412 GIC        3512 :     pfree(delstate.deltids);
     413 CBC        3512 :     pfree(delstate.status);
     414 ECB             : 
     415                 :     /* Report "success" to caller unconditionally to avoid deduplication */
     416 GIC        3512 :     if (neverdedup)
     417 CBC           5 :         return true;
     418 ECB             : 
     419                 :     /* Don't dedup when we won't end up back here any time soon anyway */
     420 GIC        3507 :     return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
     421 ECB             : }
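
/*
 * Illustrative sketch, not part of nbtdedup.c: acting on the advisory return
 * value described above, simplified from the caller in nbtinsert.c.  Passing
 * true as bottomupdedup reflects that a bottom-up pass just took place.
 */
if (_bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
    return;                     /* page split expected to be avoided for now */

/* Bottom-up deletion fell short: try a deduplication pass instead */
_bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
               insertstate->itemsz, true);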
     422                 : 
     423                 : /*
     424                 :  * Create a new pending posting list tuple based on caller's base tuple.
     425                 :  *
     426                 :  * Every tuple processed by deduplication either becomes the base tuple for a
     427                 :  * posting list, or gets its heap TID(s) accepted into a pending posting list.
     428                 :  * A tuple that starts out as the base tuple for a posting list will only
     429                 :  * actually be rewritten within _bt_dedup_finish_pending() when it turns out
     430                 :  * that there are duplicates that can be merged into the base tuple.
     431                 :  */
     432                 : void
     433 GIC     9274749 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
     434 ECB             :                         OffsetNumber baseoff)
     435                 : {
     436 GIC     9274749 :     Assert(state->nhtids == 0);
     437 CBC     9274749 :     Assert(state->nitems == 0);
     438         9274749 :     Assert(!BTreeTupleIsPivot(base));
     439 ECB             : 
     440                 :     /*
     441                 :      * Copy heap TID(s) from new base tuple for new candidate posting list
     442                 :      * into working state's array
     443                 :      */
     444 GIC     9274749 :     if (!BTreeTupleIsPosting(base))
     445 ECB             :     {
     446 GIC     7434111 :         memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
     447 CBC     7434111 :         state->nhtids = 1;
     448         7434111 :         state->basetupsize = IndexTupleSize(base);
     449 ECB             :     }
     450                 :     else
     451                 :     {
     452                 :         int         nposting;
     453                 : 
     454 GIC     1840638 :         nposting = BTreeTupleGetNPosting(base);
     455 CBC     1840638 :         memcpy(state->htids, BTreeTupleGetPosting(base),
     456 ECB             :                sizeof(ItemPointerData) * nposting);
     457 GIC     1840638 :         state->nhtids = nposting;
     458 ECB             :         /* basetupsize should not include existing posting list */
     459 GIC     1840638 :         state->basetupsize = BTreeTupleGetPostingOffset(base);
     460 ECB             :     }
     461                 : 
     462                 :     /*
     463                 :      * Save new base tuple itself -- it'll be needed if we actually create a
     464                 :      * new posting list from new pending posting list.
     465                 :      *
     466                 :      * Must maintain physical size of all existing tuples (including line
     467                 :      * pointer overhead) so that we can calculate space savings on page.
     468                 :      */
     469 GIC     9274749 :     state->nitems = 1;
     470 CBC     9274749 :     state->base = base;
     471         9274749 :     state->baseoff = baseoff;
     472         9274749 :     state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
     473 ECB             :     /* Also save baseoff in pending state for interval */
     474 GIC     9274749 :     state->intervals[state->nintervals].baseoff = state->baseoff;
     475 CBC     9274749 : }
     476 ECB             : 
     477                 : /*
     478                 :  * Save itup heap TID(s) into pending posting list where possible.
     479                 :  *
     480                 :  * Returns bool indicating if the pending posting list managed by state now
     481                 :  * includes itup's heap TID(s).
     482                 :  */
     483                 : bool
     484 GIC     1749807 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
     485 ECB             : {
     486                 :     int         nhtids;
     487                 :     ItemPointer htids;
     488                 :     Size        mergedtupsz;
     489                 : 
     490 GIC     1749807 :     Assert(!BTreeTupleIsPivot(itup));
     491 ECB             : 
     492 GIC     1749807 :     if (!BTreeTupleIsPosting(itup))
     493 ECB             :     {
     494 GIC     1739831 :         nhtids = 1;
     495 CBC     1739831 :         htids = &itup->t_tid;
     496 ECB             :     }
     497                 :     else
     498                 :     {
     499 GIC        9976 :         nhtids = BTreeTupleGetNPosting(itup);
     500 CBC        9976 :         htids = BTreeTupleGetPosting(itup);
     501 ECB             :     }
     502                 : 
     503                 :     /*
     504                 :      * Don't append (have caller finish pending posting list as-is) if
     505                 :      * appending heap TID(s) from itup would put us over maxpostingsize limit.
     506                 :      *
     507                 :      * This calculation needs to match the code used within _bt_form_posting()
     508                 :      * for new posting list tuples.
     509                 :      */
     510 GIC     1749807 :     mergedtupsz = MAXALIGN(state->basetupsize +
     511 ECB             :                            (state->nhtids + nhtids) * sizeof(ItemPointerData));
     512                 : 
     513 GIC     1749807 :     if (mergedtupsz > state->maxpostingsize)
     514 ECB             :     {
     515                 :         /*
     516                 :          * Count this as an oversized item for single value strategy, though
     517                 :          * only when there are 50 TIDs in the final posting list tuple.  This
     518                 :          * limit (which is fairly arbitrary) avoids confusion about how many
     519                 :          * 1/6 of a page tuples have been encountered/created by the current
     520                 :          * deduplication pass.
     521                 :          *
     522                 :          * Note: We deliberately don't consider which deduplication pass
     523                 :          * merged together tuples to create this item (could be a previous
     524                 :          * deduplication pass, or current pass).  See _bt_do_singleval()
     525                 :          * comments.
     526                 :          */
     527 GIC       14378 :         if (state->nhtids > 50)
     528 CBC       13882 :             state->nmaxitems++;
     529 ECB             : 
     530 GIC       14378 :         return false;
     531 ECB             :     }
     532                 : 
     533                 :     /*
     534                 :      * Save heap TIDs to pending posting list tuple -- itup can be merged into
     535                 :      * pending posting list
     536                 :      */
     537 GIC     1735429 :     state->nitems++;
     538 CBC     1735429 :     memcpy(state->htids + state->nhtids, htids,
     539 ECB             :            sizeof(ItemPointerData) * nhtids);
     540 GIC     1735429 :     state->nhtids += nhtids;
     541 CBC     1735429 :     state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
     542 ECB             : 
     543 GIC     1735429 :     return true;
     544 ECB             : }
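
/*
 * Worked example of the mergedtupsz check above, assuming 8-byte MAXALIGN()
 * and 6-byte ItemPointerData: with basetupsize = 16 and 10 TIDs already
 * saved, merging a plain tuple (nhtids = 1) gives
 * MAXALIGN(16 + 11 * 6) = MAXALIGN(82) = 88 bytes.  The merge goes ahead
 * only when 88 <= maxpostingsize; otherwise caller finalizes the pending
 * posting list and starts a new one with itup as its base tuple.
 */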
     545                 : 
     546                 : /*
     547                 :  * Finalize pending posting list tuple, and add it to the page.  Final tuple
     548                 :  * is based on saved base tuple, and saved list of heap TIDs.
     549                 :  *
     550                 :  * Returns space saving from deduplicating to make a new posting list tuple.
     551                 :  * Note that this includes line pointer overhead.  This is zero in the case
     552                 :  * where no deduplication was possible.
     553                 :  */
     554                 : Size
     555 GIC     6000318 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
     556 ECB             : {
     557                 :     OffsetNumber tupoff;
     558                 :     Size        tuplesz;
     559                 :     Size        spacesaving;
     560                 : 
     561 GIC     6000318 :     Assert(state->nitems > 0);
     562 CBC     6000318 :     Assert(state->nitems <= state->nhtids);
     563         6000318 :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
     564 ECB             : 
     565 GIC     6000318 :     tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
     566 CBC     6000318 :     if (state->nitems == 1)
     567 ECB             :     {
     568                 :         /* Use original, unchanged base tuple */
     569 GIC     5657980 :         tuplesz = IndexTupleSize(state->base);
     570 GNC     5657980 :         Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
     571         5657980 :         Assert(tuplesz <= BTMaxItemSize(newpage));
     572 CBC     5657980 :         if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
     573 ECB             :                         false, false) == InvalidOffsetNumber)
     574 LBC           0 :             elog(ERROR, "deduplication failed to add tuple to page");
     575 ECB             : 
     576 GIC     5657980 :         spacesaving = 0;
     577 EUB             :     }
     578                 :     else
     579 ECB             :     {
     580                 :         IndexTuple  final;
     581                 : 
     582                 :         /* Form a tuple with a posting list */
     583 GIC      342338 :         final = _bt_form_posting(state->base, state->htids, state->nhtids);
     584          342338 :         tuplesz = IndexTupleSize(final);
     585          342338 :         Assert(tuplesz <= state->maxpostingsize);
     586 ECB             : 
     587                 :         /* Save final number of items for posting list */
     588 CBC      342338 :         state->intervals[state->nintervals].nitems = state->nitems;
     589                 : 
     590 GIC      342338 :         Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
     591 GNC      342338 :         Assert(tuplesz <= BTMaxItemSize(newpage));
     592 CBC      342338 :         if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
     593                 :                         false) == InvalidOffsetNumber)
     594 LBC           0 :             elog(ERROR, "deduplication failed to add tuple to page");
     595 ECB             : 
     596 CBC      342338 :         pfree(final);
     597 GIC      342338 :         spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
     598 EUB             :         /* Increment nintervals, since we wrote a new posting list tuple */
     599 GIC      342338 :         state->nintervals++;
     600 CBC      342338 :         Assert(spacesaving > 0 && spacesaving < BLCKSZ);
     601 ECB             :     }
     602                 : 
     603                 :     /* Reset state for next pending posting list */
     604 CBC     6000318 :     state->nhtids = 0;
     605 GIC     6000318 :     state->nitems = 0;
     606         6000318 :     state->phystupsize = 0;
     607                 : 
     608 CBC     6000318 :     return spacesaving;
     609 ECB             : }
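
/*
 * Illustrative sketch, not part of nbtdedup.c: the minimal driver loop over
 * the BTDedupState API used by both passes above (single value strategy
 * steps and error handling omitted; page/newpage setup as in _bt_dedup_pass).
 */
for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
{
    IndexTuple  itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offnum));

    if (offnum == minoff)
        _bt_dedup_start_pending(state, itup, offnum);
    else if (!(_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
               _bt_dedup_save_htid(state, itup)))
    {
        /* itup not mergeable: finalize pending list, start the next one */
        _bt_dedup_finish_pending(newpage, state);
        _bt_dedup_start_pending(state, itup, offnum);
    }
}
_bt_dedup_finish_pending(newpage, state);       /* final pending interval */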
     610                 : 
     611                 : /*
     612                 :  * Finalize interval during bottom-up index deletion.
     613                 :  *
     614                 :  * During a bottom-up pass we expect that TIDs will be recorded in dedup state
     615                 :  * first, and then get moved over to delstate (in variable-sized batches) by
     616                 :  * calling here.  Call here happens when the number of TIDs in a dedup
     617                 :  * interval is known, and interval gets finalized (i.e. when caller sees next
     618                 :  * tuple on the page is not a duplicate, or when caller runs out of tuples to
     619                 :  * process from leaf page).
     620                 :  *
     621                 :  * This is where bottom-up deletion determines and remembers which entries are
     622                 :  * duplicates.  This will be important information to the tableam delete
     623                 :  * infrastructure later on.  Plain index tuple duplicates are marked
     624                 :  * "promising" here, per tableam contract.
     625                 :  *
     626                 :  * Our approach to marking entries whose TIDs come from posting lists is more
     627                 :  * complicated.  Posting lists can only be formed by a deduplication pass (or
     628                 :  * during an index build), so recent version churn affecting the pointed-to
     629                 :  * logical rows is not particularly likely.  We may still give a weak signal
     630                 :  * about posting list tuples' entries (by marking just one of its TIDs/entries
     631                 :  * promising), though this is only a possibility in the event of further
     632                 :  * duplicate index tuples in final interval that covers posting list tuple (as
     633                 :  * in the plain tuple case).  A weak signal/hint will be useful to the tableam
     634                 :  * when it has no stronger signal to go with for the deletion operation as a
     635                 :  * whole.
     636                 :  *
     637                 :  * The heuristics we use work well in practice because we only need to give
     638                 :  * the tableam the right _general_ idea about where to look.  Garbage tends to
     639                 :  * naturally get concentrated in relatively few table blocks with workloads
     640                 :  * that bottom-up deletion targets.  The tableam cannot possibly rank all
     641                 :  * available table blocks sensibly based on the hints we provide, but that's
     642                 :  * okay -- only the extremes matter.  The tableam just needs to be able to
     643                 :  * predict which few table blocks will have the most tuples that are safe to
     644                 :  * delete for each deletion operation, with low variance across related
     645                 :  * deletion operations.
     646                 :  */
     647                 : static void
     648 GIC      872320 : _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
     649                 :                                TM_IndexDeleteOp *delstate)
     650                 : {
     651          872320 :     bool        dupinterval = (state->nitems > 1);
     652 ECB             : 
     653 GIC      872320 :     Assert(state->nitems > 0);
     654          872320 :     Assert(state->nitems <= state->nhtids);
     655 CBC      872320 :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
     656                 : 
     657         1899231 :     for (int i = 0; i < state->nitems; i++)
     658 ECB             :     {
     659 CBC     1026911 :         OffsetNumber offnum = state->baseoff + i;
     660 GIC     1026911 :         ItemId      itemid = PageGetItemId(page, offnum);
     661 CBC     1026911 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
     662 GIC     1026911 :         TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
     663 CBC     1026911 :         TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
     664 ECB             : 
     665 CBC     1026911 :         if (!BTreeTupleIsPosting(itup))
     666 ECB             :         {
     667                 :             /* Simple case: A plain non-pivot tuple */
     668 GIC      858455 :             ideltid->tid = itup->t_tid;
     669 CBC      858455 :             ideltid->id = delstate->ndeltids;
     670 GIC      858455 :             istatus->idxoffnum = offnum;
     671          858455 :             istatus->knowndeletable = false; /* for now */
     672 CBC      858455 :             istatus->promising = dupinterval;    /* simple rule */
     673          858455 :             istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
     674 ECB             : 
     675 CBC      858455 :             delstate->ndeltids++;
     676 ECB             :         }
     677                 :         else
     678                 :         {
     679                 :             /*
     680                 :              * Complicated case: A posting list tuple.
     681                 :              *
     682                 :              * We make the conservative assumption that there can only be at
     683                 :              * most one affected logical row per posting list tuple.  There
     684                 :              * will be at most one promising entry in deltids to represent
     685                 :              * this presumed lone logical row.  Note that this isn't even
     686                 :              * considered unless the posting list tuple is also in an interval
     687                 :              * of duplicates -- this complicated rule is just a variant of the
     688                 :              * simple rule used to decide if plain index tuples are promising.
     689                 :              */
     690 GIC      168456 :             int         nitem = BTreeTupleGetNPosting(itup);
     691          168456 :             bool        firstpromising = false;
     692          168456 :             bool        lastpromising = false;
     693                 : 
     694 CBC      168456 :             Assert(_bt_posting_valid(itup));
     695 ECB             : 
     696 CBC      168456 :             if (dupinterval)
     697                 :             {
     698 ECB             :                 /*
     699                 :                  * Complicated rule: either the first or last TID in the
     700                 :                  * posting list gets marked promising (if any at all)
     701                 :                  */
     702                 :                 BlockNumber minblocklist,
     703                 :                             midblocklist,
     704                 :                             maxblocklist;
     705                 :                 ItemPointer mintid,
     706                 :                             midtid,
     707                 :                             maxtid;
     708                 : 
     709 GIC       10036 :                 mintid = BTreeTupleGetHeapTID(itup);
     710           10036 :                 midtid = BTreeTupleGetPostingN(itup, nitem / 2);
     711           10036 :                 maxtid = BTreeTupleGetMaxHeapTID(itup);
     712           10036 :                 minblocklist = ItemPointerGetBlockNumber(mintid);
     713 CBC       10036 :                 midblocklist = ItemPointerGetBlockNumber(midtid);
     714           10036 :                 maxblocklist = ItemPointerGetBlockNumber(maxtid);
     715 ECB             : 
     716                 :                 /* Only entry with predominant table block can be promising */
     717 CBC       10036 :                 firstpromising = (minblocklist == midblocklist);
     718           10036 :                 lastpromising = (!firstpromising &&
     719                 :                                  midblocklist == maxblocklist);
     720                 :             }
     721 ECB             : 
     722 CBC      791085 :             for (int p = 0; p < nitem; p++)
     723                 :             {
     724 GIC      622629 :                 ItemPointer htid = BTreeTupleGetPostingN(itup, p);
     725                 : 
     726 CBC      622629 :                 ideltid->tid = *htid;
     727 GIC      622629 :                 ideltid->id = delstate->ndeltids;
     728 CBC      622629 :                 istatus->idxoffnum = offnum;
     729 GIC      622629 :                 istatus->knowndeletable = false; /* for now */
     730 CBC      622629 :                 istatus->promising = false;
     731          622629 :                 if ((firstpromising && p == 0) ||
     732           68395 :                     (lastpromising && p == nitem - 1))
     733            6802 :                     istatus->promising = true;
     734          622629 :                 istatus->freespace = sizeof(ItemPointerData);    /* at worst */
     735 ECB             : 
     736 CBC      622629 :                 ideltid++;
     737          622629 :                 istatus++;
     738          622629 :                 delstate->ndeltids++;
     739                 :             }
     740 ECB             :         }
     741                 :     }
     742                 : 
     743 GIC      872320 :     if (dupinterval)
     744                 :     {
     745           95295 :         state->intervals[state->nintervals].nitems = state->nitems;
     746           95295 :         state->nintervals++;
     747 ECB             :     }
     748                 : 
     749                 :     /* Reset state for next interval */
     750 CBC      872320 :     state->nhtids = 0;
     751 GIC      872320 :     state->nitems = 0;
     752          872320 :     state->phystupsize = 0;
     753          872320 : }
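
/*
 * Worked example of the "first or last TID" rule above: for a posting list
 * inside a duplicate interval with TIDs (5,1) (5,2) (5,3) (9,1), nitem = 4,
 * so midtid is the TID at index 2, giving minblocklist = 5, midblocklist = 5
 * and maxblocklist = 9; firstpromising is true and only (5,1) is marked
 * promising.  With TIDs (5,1) (9,1) (9,2) (9,3) instead, midblocklist = 9
 * matches maxblocklist, so only the final TID (9,3) is marked promising.
 */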
     754 ECB             : 
     755                 : /*
     756                 :  * Determine if page non-pivot tuples (data items) are all duplicates of the
     757                 :  * same value -- if they are, deduplication's "single value" strategy should
     758                 :  * be applied.  The general goal of this strategy is to ensure that
     759                 :  * nbtsplitloc.c (which uses its own single value strategy) will find a useful
     760                 :  * split point as further duplicates are inserted, and successive rightmost
     761                 :  * page splits occur among pages that store the same duplicate value.  When
     762                 :  * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
     763                 :  * just like it would if deduplication were disabled.
     764                 :  *
     765                 :  * We expect that affected workloads will require _several_ single value
     766                 :  * strategy deduplication passes (over a page that only stores duplicates)
     767                 :  * before the page is finally split.  The first deduplication pass should only
     768                 :  * find regular non-pivot tuples.  Later deduplication passes will find
     769                 :  * existing maxpostingsize-capped posting list tuples, which must be skipped
     770                 :  * over.  The penultimate pass is generally the first pass that actually
     771                 :  * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
     772                 :  * few untouched non-pivot tuples.  The final deduplication pass won't free
     773                 :  * any space -- it will skip over everything without merging anything (it
     774                 :  * retraces the steps of the penultimate pass).
     775                 :  *
     776                 :  * Fortunately, having several passes isn't too expensive.  Each pass (after
     777                 :  * the first pass) won't spend many cycles on the large posting list tuples
     778                 :  * left by previous passes.  Each pass will find a large contiguous group of
     779                 :  * smaller duplicate tuples to merge together at the end of the page.
     780                 :  */
     781                 : static bool
     782 GIC       25291 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
     783                 :                  OffsetNumber minoff, IndexTuple newitem)
     784                 : {
     785           25291 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
     786 ECB             :     ItemId      itemid;
     787                 :     IndexTuple  itup;
     788                 : 
     789 CBC       25291 :     itemid = PageGetItemId(page, minoff);
     790 GIC       25291 :     itup = (IndexTuple) PageGetItem(page, itemid);
     791                 : 
     792           25291 :     if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     793 ECB             :     {
     794 CBC         926 :         itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
     795 GIC         926 :         itup = (IndexTuple) PageGetItem(page, itemid);
     796 ECB             : 
     797 GIC         926 :         if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
     798 CBC         568 :             return true;
     799 ECB             :     }
     800                 : 
     801 CBC       24723 :     return false;
     802 ECB             : }
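
/*
 * Note on the tests above: _bt_keep_natts_fast() returns one more than the
 * number of leading key attributes that compare equal, so a result greater
 * than nkeyatts means the tuples are equal on every key column.  Comparing
 * newitem against just the first and last data items is a cheap proxy for
 * "every data item on the page is a duplicate of newitem".
 */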
     803                 : 
     804                 : /*
     805                 :  * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
     806                 :  * and final maxpostingsize-capped tuple.  The sixth and final posting list
     807                 :  * tuple will end up somewhat smaller than the first five.  (Note: The first
     808                 :  * five tuples could actually just be very large duplicate tuples that
     809                 :  * couldn't be merged together at all.  Deduplication will simply not modify
     810                 :  * the page when that happens.)
     811                 :  *
     812                 :  * When there are six posting lists on the page (after current deduplication
     813                 :  * pass goes on to create/observe a sixth very large tuple), caller should end
     814                 :  * its deduplication pass.  It isn't useful to try to deduplicate items that
     815                 :  * are supposed to end up on the new right sibling page following the
     816                 :  * anticipated page split.  A future deduplication pass of future right
     817                 :  * sibling page might take care of it.  (This is why the first single value
     818                 :  * strategy deduplication pass for a given leaf page will generally find only
     819                 :  * plain non-pivot tuples -- see _bt_do_singleval() comments.)
     820                 :  */
     821                 : static void
     822 GIC         318 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
     823                 : {
     824                 :     Size        leftfree;
     825                 :     int         reduction;
     826 ECB             : 
     827                 :     /* This calculation needs to match nbtsplitloc.c */
     828 GIC         318 :     leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
     829                 :         MAXALIGN(sizeof(BTPageOpaqueData));
     830                 :     /* Subtract size of new high key (includes pivot heap TID space) */
     831             318 :     leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
     832 ECB             : 
     833                 :     /*
     834                 :      * Reduce maxpostingsize by an amount equal to target free space on left
     835                 :      * half of page
     836                 :      */
     837 GIC         318 :     reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
     838             318 :     if (state->maxpostingsize > reduction)
     839             318 :         state->maxpostingsize -= reduction;
     840                 :     else
     841 LBC           0 :         state->maxpostingsize = 0;
     842 CBC         318 : }
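
/*
 * Worked example, assuming BLCKSZ = 8192, BTREE_SINGLEVAL_FILLFACTOR = 99,
 * SizeOfPageHeaderData = 24 and MAXALIGN(sizeof(BTPageOpaqueData)) = 16:
 * leftfree starts at 8192 - 24 - 16 = 8152.  With newitemsz = 24 (line
 * pointer included) plus MAXALIGN(sizeof(ItemPointerData)) = 8, leftfree
 * becomes 8120, and reduction = (int) (8120 * 0.01) = 81, so maxpostingsize
 * shrinks by 81 bytes -- matching the free space nbtsplitloc.c will want on
 * the left half of the anticipated 99% fillfactor split.
 */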
     843 ECB             : 
     844                 : /*
     845 EUB             :  * Build a posting list tuple based on caller's "base" index tuple and list of
     846 ECB             :  * heap TIDs.  When nhtids == 1, builds a standard non-pivot tuple without a
     847                 :  * posting list. (Posting list tuples can never have a single heap TID, partly
     848                 :  * because that ensures that deduplication always reduces final MAXALIGN()'d
     849                 :  * size of entire tuple.)
     850                 :  *
     851                 :  * Convention is that posting list starts at a MAXALIGN()'d offset (rather
     852                 :  * than a SHORTALIGN()'d offset), in line with the approach taken when
     853                 :  * appending a heap TID to new pivot tuple/high key during suffix truncation.
     854                 :  * This sometimes wastes a little space that was only needed as alignment
     855                 :  * padding in the original tuple.  Following this convention simplifies the
     856                 :  * space accounting used when deduplicating a page (the same convention
     857                 :  * simplifies the accounting for choosing a point to split a page at).
     858                 :  *
     859                 :  * Note: Caller's "htids" array must be unique and already in ascending TID
     860                 :  * order.  Any existing heap TIDs from "base" won't automatically appear in
     861                 :  * the returned posting list tuple (they must be included in the htids array).
     862                 :  */
     863                 : IndexTuple
     864 GIC      381463 : _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
     865                 : {
     866                 :     uint32      keysize,
     867                 :                 newsize;
     868 ECB             :     IndexTuple  itup;
     869                 : 
     870 GIC      381463 :     if (BTreeTupleIsPosting(base))
     871           69051 :         keysize = BTreeTupleGetPostingOffset(base);
     872                 :     else
     873          312412 :         keysize = IndexTupleSize(base);
     874 ECB             : 
     875 CBC      381463 :     Assert(!BTreeTupleIsPivot(base));
     876 GIC      381463 :     Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
     877 CBC      381463 :     Assert(keysize == MAXALIGN(keysize));
     878                 : 
     879 ECB             :     /* Determine final size of new tuple */
     880 CBC      381463 :     if (nhtids > 1)
     881          372094 :         newsize = MAXALIGN(keysize +
     882                 :                            nhtids * sizeof(ItemPointerData));
     883                 :     else
     884            9369 :         newsize = keysize;
     885 ECB             : 
     886 GIC      381463 :     Assert(newsize <= INDEX_SIZE_MASK);
     887          381463 :     Assert(newsize == MAXALIGN(newsize));
     888 ECB             : 
     889                 :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
     890 CBC      381463 :     itup = palloc0(newsize);
     891          381463 :     memcpy(itup, base, keysize);
     892 GIC      381463 :     itup->t_info &= ~INDEX_SIZE_MASK;
     893          381463 :     itup->t_info |= newsize;
     894 CBC      381463 :     if (nhtids > 1)
     895 ECB             :     {
     896                 :         /* Form posting list tuple */
     897 CBC      372094 :         BTreeTupleSetPosting(itup, nhtids, keysize);
     898          372094 :         memcpy(BTreeTupleGetPosting(itup), htids,
     899                 :                sizeof(ItemPointerData) * nhtids);
     900 GIC      372094 :         Assert(_bt_posting_valid(itup));
     901 ECB             :     }
     902                 :     else
     903                 :     {
     904                 :         /* Form standard non-pivot tuple */
     905 GIC        9369 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
     906            9369 :         ItemPointerCopy(htids, &itup->t_tid);
     907            9369 :         Assert(ItemPointerIsValid(&itup->t_tid));
     908                 :     }
     909 ECB             : 
     910 CBC      381463 :     return itup;
     911 ECB             : }
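
/*
 * A minimal sketch of a caller honoring the contract above -- htids unique
 * and in ascending TID order.  The function name and the TID values are
 * hypothetical, chosen only for illustration; "base" is assumed to be a
 * plain non-pivot tuple.
 */
static IndexTuple
example_form_posting(IndexTuple base)
{
    ItemPointerData htids[3];

    /* Three distinct heap TIDs, presorted in ascending order */
    ItemPointerSet(&htids[0], 1, 7);
    ItemPointerSet(&htids[1], 1, 9);
    ItemPointerSet(&htids[2], 2, 3);

    /*
     * If base's keysize were 16, the result would be sized as
     * MAXALIGN(16 + 3 * sizeof(ItemPointerData)) = MAXALIGN(34) = 40
     * on MAXIMUM_ALIGNOF == 8 builds.
     */
    return _bt_form_posting(base, htids, 3);
}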
     912                 : 
     913                 : /*
     914                 :  * Generate a replacement tuple by "updating" a posting list tuple so that it
     915                 :  * no longer has TIDs that need to be deleted.
     916                 :  *
     917                 :  * Used by both VACUUM and index deletion.  Caller's vacposting argument
     918                 :  * points to the existing posting list tuple to be updated.
     919                 :  *
     920                 :  * On return, caller's vacposting argument will point to final "updated"
     921                 :  * tuple, which will be palloc()'d in caller's memory context.
     922                 :  */
     923                 : void
     924 GIC       87661 : _bt_update_posting(BTVacuumPosting vacposting)
     925                 : {
     926           87661 :     IndexTuple  origtuple = vacposting->itup;
     927                 :     uint32      keysize,
     928 ECB             :                 newsize;
     929                 :     IndexTuple  itup;
     930                 :     int         nhtids;
     931                 :     int         ui,
     932                 :                 d;
     933                 :     ItemPointer htids;
     934                 : 
     935 GIC       87661 :     nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
     936                 : 
     937           87661 :     Assert(_bt_posting_valid(origtuple));
     938           87661 :     Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
     939 ECB             : 
     940                 :     /*
     941                 :      * Determine final size of new tuple.
     942                 :      *
     943                 :      * This calculation needs to match the code used within _bt_form_posting()
     944                 :      * for new posting list tuples.  We avoid calling _bt_form_posting() here
     945                 :  * to save ourselves a second memory allocation for an htids workspace.
     946                 :      */
     947 GIC       87661 :     keysize = BTreeTupleGetPostingOffset(origtuple);
     948           87661 :     if (nhtids > 1)
     949            6637 :         newsize = MAXALIGN(keysize +
     950                 :                            nhtids * sizeof(ItemPointerData));
     951 ECB             :     else
     952 CBC       81024 :         newsize = keysize;
     953 ECB             : 
     954 GIC       87661 :     Assert(newsize <= INDEX_SIZE_MASK);
     955           87661 :     Assert(newsize == MAXALIGN(newsize));
     956 ECB             : 
     957                 :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
     958 CBC       87661 :     itup = palloc0(newsize);
     959           87661 :     memcpy(itup, origtuple, keysize);
     960 GIC       87661 :     itup->t_info &= ~INDEX_SIZE_MASK;
     961           87661 :     itup->t_info |= newsize;
     962 ECB             : 
     963 CBC       87661 :     if (nhtids > 1)
     964 ECB             :     {
     965                 :         /* Form posting list tuple */
     966 GIC        6637 :         BTreeTupleSetPosting(itup, nhtids, keysize);
     967 CBC        6637 :         htids = BTreeTupleGetPosting(itup);
     968                 :     }
     969                 :     else
     970 ECB             :     {
     971                 :         /* Form standard non-pivot tuple */
     972 GIC       81024 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
     973           81024 :         htids = &itup->t_tid;
     974                 :     }
     975                 : 
     976 CBC       87661 :     ui = 0;
     977           87661 :     d = 0;
     978 GIC      372085 :     for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
     979                 :     {
     980 CBC      284424 :         if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
     981 ECB             :         {
     982 CBC      120806 :             d++;
     983 GIC      120806 :             continue;
     984 ECB             :         }
     985 GIC      163618 :         htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
     986 ECB             :     }
     987 CBC       87661 :     Assert(ui == nhtids);
     988 GIC       87661 :     Assert(d == vacposting->ndeletedtids);
     989 CBC       87661 :     Assert(nhtids == 1 || _bt_posting_valid(itup));
     990 GIC       87661 :     Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
     991 ECB             : 
     992                 :     /* vacposting arg's itup will now point to updated version */
     993 CBC       87661 :     vacposting->itup = itup;
     994           87661 : }
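
/*
 * A minimal sketch of a VACUUM-style caller, under the contract above.  The
 * function name and the choice of dead TIDs are hypothetical; real callers
 * (btvacuumpage() and _bt_delitems_delete_check()) establish which TIDs are
 * deletable by other means.  Assumes "posting" holds at least three heap
 * TIDs, so that at least one survives.
 */
static void
example_update_posting(IndexTuple posting, OffsetNumber updatedoffset)
{
    BTVacuumPosting vacposting;

    /* Allocate with flexible array member sized for two deletable TIDs */
    vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
                        2 * sizeof(uint16));
    vacposting->itup = posting;
    vacposting->updatedoffset = updatedoffset;
    vacposting->ndeletedtids = 2;
    vacposting->deletetids[0] = 0;  /* 0-based offsets into posting list, */
    vacposting->deletetids[1] = 2;  /* kept in ascending order */

    /* On return, vacposting->itup points to the palloc()'d replacement */
    _bt_update_posting(vacposting);
}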
     995                 : 
     996                 : /*
     997 ECB             :  * Prepare for a posting list split by swapping heap TID in newitem with heap
     998                 :  * TID from original posting list (the 'oposting' heap TID located at offset
     999                 :  * 'postingoff').  Modifies newitem, so the caller should pass a private
    1000                 :  * copy that can safely be modified.
    1001                 :  *
    1002                 :  * Returns new posting list tuple, which is palloc()'d in caller's context.
    1003                 :  * This is guaranteed to be the same size as 'oposting'.  Modified newitem is
    1004                 :  * what the caller actually inserts.  (This happens inside the same critical
    1005                 :  * section that performs an in-place update of old posting list using new
    1006                 :  * posting list returned here.)
    1007                 :  *
    1008                 :  * While the keys from newitem and oposting must be opclass equal, and must
    1009                 :  * generate identical output when run through the underlying type's output
    1010                 :  * function, it doesn't follow that their representations match exactly.
    1011                 :  * Caller must avoid assuming that there can't be representational differences
    1012                 :  * that make datums from oposting bigger or smaller than the corresponding
    1013                 :  * datums from newitem.  For example, differences in TOAST input state might
    1014                 :  * break a faulty assumption about tuple size (the executor is entitled to
    1015                 :  * apply TOAST compression based on its own criteria).  It also seems possible
    1016                 :  * that further representational variation will be introduced in the future,
    1017                 :  * in order to support nbtree features like page-level prefix compression.
    1018                 :  *
    1019                 :  * See nbtree/README for details on the design of posting list splits.
    1020                 :  */
    1021                 : IndexTuple
    1022 GIC        9794 : _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
    1023                 : {
    1024                 :     int         nhtids;
    1025                 :     char       *replacepos;
    1026 ECB             :     char       *replaceposright;
    1027                 :     Size        nmovebytes;
    1028                 :     IndexTuple  nposting;
    1029                 : 
    1030 GIC        9794 :     nhtids = BTreeTupleGetNPosting(oposting);
    1031            9794 :     Assert(_bt_posting_valid(oposting));
    1032                 : 
    1033                 :     /*
    1034 ECB             :      * The postingoff argument originated as a _bt_binsrch_posting() return
    1035                 :      * value.  It will be 0 in the event of corruption that makes a leaf page
    1036                 :      * contain a non-pivot tuple that's somehow identical to newitem (no two
    1037                 :      * non-pivot tuples should ever have the same TID).  This has been known
    1038                 :      * to happen in the field from time to time.
    1039                 :      *
    1040                 :      * Perform a basic sanity check to catch this case now.
    1041                 :      */
    1042 GIC        9794 :     if (!(postingoff > 0 && postingoff < nhtids))
    1043 UIC           0 :         elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
    1044                 :              nhtids, postingoff);
    1045                 : 
    1046 ECB             :     /*
    1047 EUB             :      * Move item pointers in posting list to make a gap for the new item's
    1048                 :      * heap TID.  We shift TIDs one place to the right, losing original
    1049                 :      * rightmost TID. (nmovebytes must not include TIDs to the left of
    1050                 :      * postingoff, nor the existing rightmost/max TID that gets overwritten.)
    1051                 :      */
    1052 GIC        9794 :     nposting = CopyIndexTuple(oposting);
    1053            9794 :     replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
    1054            9794 :     replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
    1055            9794 :     nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
    1056 CBC        9794 :     memmove(replaceposright, replacepos, nmovebytes);
    1057 ECB             : 
    1058                 :     /* Fill the gap at postingoff with TID of new item (original new TID) */
    1059 CBC        9794 :     Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
    1060            9794 :     ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
    1061                 : 
    1062                 :     /* Now copy oposting's rightmost/max TID into new item (final new TID) */
    1063            9794 :     ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
    1064 ECB             : 
    1065 GIC        9794 :     Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
    1066                 :                               BTreeTupleGetHeapTID(newitem)) < 0);
    1067 CBC        9794 :     Assert(_bt_posting_valid(nposting));
    1068                 : 
    1069            9794 :     return nposting;
    1070                 : }
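
/*
 * A worked example of the swap above, with made-up TIDs.  Suppose oposting
 * holds (1,1) (1,3) (1,5), newitem carries (1,4), and
 * _bt_binsrch_posting() returned postingoff = 2.  nmovebytes is then
 * (3 - 2 - 1) * sizeof(ItemPointerData) = 0, slot 2 of the copy is
 * overwritten with (1,4), and newitem leaves with the old max TID (1,5).
 * The returned posting list reads (1,1) (1,3) (1,4), while newitem --
 * inserted to its right -- carries (1,5), preserving heap TID order on
 * the page.
 */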
    1071 ECB             : 
    1072                 : /*
    1073                 :  * Verify posting list invariants for "posting", which must be a posting list
    1074                 :  * tuple.  Used within assertions.
    1075                 :  */
    1076                 : #ifdef USE_ASSERT_CHECKING
    1077                 : static bool
    1078 GIC      654436 : _bt_posting_valid(IndexTuple posting)
    1079                 : {
    1080                 :     ItemPointerData last;
    1081                 :     ItemPointer htid;
    1082 ECB             : 
    1083 GIC      654436 :     if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
    1084 UIC           0 :         return false;
    1085                 : 
    1086                 :     /* Remember first heap TID for loop */
    1087 CBC      654436 :     ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
    1088 GBC      654436 :     if (!ItemPointerIsValid(&last))
    1089 UIC           0 :         return false;
    1090                 : 
    1091 ECB             :     /* Iterate, starting from second TID */
    1092 CBC     7657174 :     for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
    1093 EUB             :     {
    1094 GIC     7002738 :         htid = BTreeTupleGetPostingN(posting, i);
    1095                 : 
    1096 CBC     7002738 :         if (!ItemPointerIsValid(htid))
    1097 UIC           0 :             return false;
    1098 CBC     7002738 :         if (ItemPointerCompare(htid, &last) <= 0)
    1099 UIC           0 :             return false;
    1100 CBC     7002738 :         ItemPointerCopy(htid, &last);
    1101 EUB             :     }
    1102 ECB             : 
    1103 GBC      654436 :     return true;
    1104 ECB             : }
    1105                 : #endif
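
/*
 * Illustration of the invariants checked above, with made-up TIDs: a
 * posting list holding (1,1) (1,2) (2,1) passes; (1,1) (1,1) fails the
 * strictly-ascending comparison; and a tuple with fewer than two TIDs
 * fails outright, since single-TID tuples are always stored in plain
 * non-pivot form.
 */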
        

Generated by: LCOV version v1.16-55-g56c0a2a