LCOV - differential code coverage report
Current view: top level - src/backend/access/nbtree - nbtdedup.c (source / functions) Coverage Total Hit UBC CBC
Current: Differential Code Coverage 16@8cea358b128 vs 17@8cea358b128 Lines: 96.8 % 346 335 11 335
Current Date: 2024-04-14 14:21:10 Functions: 100.0 % 12 12 12
Baseline: 16@8cea358b128 Branches: 70.2 % 228 160 68 160
Baseline Date: 2024-04-14 14:21:09 Line coverage date bins:
Legend: Lines: hit not hit | Branches: + taken - not taken # not executed (240..) days: 96.8 % 346 335 11 335
Function coverage date bins:
(240..) days: 100.0 % 12 12 12
Branch coverage date bins:
(240..) days: 70.2 % 228 160 68 160

 Age         Owner                    Branch data    TLA  Line data    Source code
                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  *
                                  3                 :                :  * nbtdedup.c
                                  4                 :                :  *    Deduplicate or bottom-up delete items in Postgres btrees.
                                  5                 :                :  *
                                  6                 :                :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
                                  7                 :                :  * Portions Copyright (c) 1994, Regents of the University of California
                                  8                 :                :  *
                                  9                 :                :  *
                                 10                 :                :  * IDENTIFICATION
                                 11                 :                :  *    src/backend/access/nbtree/nbtdedup.c
                                 12                 :                :  *
                                 13                 :                :  *-------------------------------------------------------------------------
                                 14                 :                :  */
                                 15                 :                : #include "postgres.h"
                                 16                 :                : 
                                 17                 :                : #include "access/nbtree.h"
                                 18                 :                : #include "access/nbtxlog.h"
                                 19                 :                : #include "access/xloginsert.h"
                                 20                 :                : #include "miscadmin.h"
                                 21                 :                : #include "utils/rel.h"
                                 22                 :                : 
                                 23                 :                : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
                                 24                 :                :                                            TM_IndexDeleteOp *delstate);
                                 25                 :                : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
                                 26                 :                :                              OffsetNumber minoff, IndexTuple newitem);
                                 27                 :                : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
                                 28                 :                :                                      Size newitemsz);
                                 29                 :                : #ifdef USE_ASSERT_CHECKING
                                 30                 :                : static bool _bt_posting_valid(IndexTuple posting);
                                 31                 :                : #endif
                                 32                 :                : 
                                 33                 :                : /*
                                 34                 :                :  * Perform a deduplication pass.
                                 35                 :                :  *
                                 36                 :                :  * The general approach taken here is to perform as much deduplication as
                                 37                 :                :  * possible to free as much space as possible.  Note, however, that "single
                                 38                 :                :  * value" strategy is used for !bottomupdedup callers when the page is full of
                                 39                 :                :  * tuples of a single value.  Deduplication passes that apply the strategy
                                 40                 :                :  * will leave behind a few untouched tuples at the end of the page, preparing
                                 41                 :                :  * the page for an anticipated page split that uses nbtsplitloc.c's own single
                                 42                 :                :  * value strategy.  Our high level goal is to delay merging the untouched
                                 43                 :                :  * tuples until after the page splits.
                                 44                 :                :  *
                                 45                 :                :  * When a call to _bt_bottomupdel_pass() just took place (and failed), our
                                 46                 :                :  * high level goal is to prevent a page split entirely by buying more time.
                                 47                 :                :  * We still hope that a page split can be avoided altogether.  That's why
                                 48                 :                :  * single value strategy is not even considered for bottomupdedup callers.
                                 49                 :                :  *
                                 50                 :                :  * The page will have to be split if we cannot successfully free at least
                                 51                 :                :  * newitemsz (we also need space for newitem's line pointer, which isn't
                                 52                 :                :  * included in caller's newitemsz).
                                 53                 :                :  *
                                 54                 :                :  * Note: Caller should have already deleted all existing items with their
                                 55                 :                :  * LP_DEAD bits set.
                                 56                 :                :  */
                                 57                 :                : void
  362 pg@bowt.ie                 58                 :CBC       11769 : _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
                                 59                 :                :                bool bottomupdedup)
                                 60                 :                : {
                                 61                 :                :     OffsetNumber offnum,
                                 62                 :                :                 minoff,
                                 63                 :                :                 maxoff;
 1509                            64                 :          11769 :     Page        page = BufferGetPage(buf);
  744 michael@paquier.xyz        65                 :          11769 :     BTPageOpaque opaque = BTPageGetOpaque(page);
                                 66                 :                :     Page        newpage;
                                 67                 :                :     BTDedupState state;
  739 tgl@sss.pgh.pa.us          68                 :          11769 :     Size        pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
 1509 pg@bowt.ie                 69                 :          11769 :     bool        singlevalstrat = false;
 1478                            70                 :          11769 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
                                 71                 :                : 
                                 72                 :                :     /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
 1509                            73                 :          11769 :     newitemsz += sizeof(ItemIdData);
                                 74                 :                : 
                                 75                 :                :     /*
                                 76                 :                :      * Initialize deduplication state.
                                 77                 :                :      *
                                 78                 :                :      * It would be possible for maxpostingsize (limit on posting list tuple
                                 79                 :                :      * size) to be set to one third of the page.  However, it seems like a
                                 80                 :                :      * good idea to limit the size of posting lists to one sixth of a page.
                                 81                 :                :      * That ought to leave us with a good split point when pages full of
                                 82                 :                :      * duplicates can be split several times.
                                 83                 :                :      */
                                 84                 :          11769 :     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
                                 85                 :          11769 :     state->deduplicate = true;
 1395                            86                 :          11769 :     state->nmaxitems = 0;
 1509                            87         [ +  - ]:          11769 :     state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
                                 88                 :                :     /* Metadata about base tuple of current pending posting list */
                                 89                 :          11769 :     state->base = NULL;
                                 90                 :          11769 :     state->baseoff = InvalidOffsetNumber;
                                 91                 :          11769 :     state->basetupsize = 0;
                                 92                 :                :     /* Metadata about current pending posting list TIDs */
                                 93                 :          11769 :     state->htids = palloc(state->maxpostingsize);
                                 94                 :          11769 :     state->nhtids = 0;
                                 95                 :          11769 :     state->nitems = 0;
                                 96                 :                :     /* Size of all physical tuples to be replaced by pending posting list */
                                 97                 :          11769 :     state->phystupsize = 0;
                                 98                 :                :     /* nintervals should be initialized to zero */
                                 99                 :          11769 :     state->nintervals = 0;
                                100                 :                : 
 1244                           101         [ +  + ]:          11769 :     minoff = P_FIRSTDATAKEY(opaque);
                                102                 :          11769 :     maxoff = PageGetMaxOffsetNumber(page);
                                103                 :                : 
                                104                 :                :     /*
                                105                 :                :      * Consider applying "single value" strategy, though only if the page
                                106                 :                :      * seems likely to be split in the near future
                                107                 :                :      */
  936                           108         [ +  + ]:          11769 :     if (!bottomupdedup)
 1509                           109                 :          10208 :         singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
                                110                 :                : 
                                111                 :                :     /*
                                112                 :                :      * Deduplicate items from page, and write them to newpage.
                                113                 :                :      *
                                114                 :                :      * Copy the original page's LSN into newpage copy.  This will become the
                                115                 :                :      * updated version of the page.  We need this because XLogInsert will
                                116                 :                :      * examine the LSN and possibly dump it in a page image.
                                117                 :                :      */
                                118                 :          11769 :     newpage = PageGetTempPageCopySpecial(page);
                                119                 :          11769 :     PageSetLSN(newpage, PageGetLSN(page));
                                120                 :                : 
                                121                 :                :     /* Copy high key, if any */
                                122         [ +  + ]:          11769 :     if (!P_RIGHTMOST(opaque))
                                123                 :                :     {
                                124                 :           9454 :         ItemId      hitemid = PageGetItemId(page, P_HIKEY);
                                125                 :           9454 :         Size        hitemsz = ItemIdGetLength(hitemid);
                                126                 :           9454 :         IndexTuple  hitem = (IndexTuple) PageGetItem(page, hitemid);
                                127                 :                : 
                                128         [ -  + ]:           9454 :         if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
                                129                 :                :                         false, false) == InvalidOffsetNumber)
 1509 pg@bowt.ie                130         [ #  # ]:UBC           0 :             elog(ERROR, "deduplication failed to add highkey");
                                131                 :                :     }
                                132                 :                : 
 1509 pg@bowt.ie                133                 :CBC       11769 :     for (offnum = minoff;
                                134         [ +  + ]:        2644425 :          offnum <= maxoff;
                                135                 :        2632656 :          offnum = OffsetNumberNext(offnum))
                                136                 :                :     {
                                137                 :        2632656 :         ItemId      itemid = PageGetItemId(page, offnum);
                                138                 :        2632656 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
                                139                 :                : 
                                140         [ -  + ]:        2632656 :         Assert(!ItemIdIsDead(itemid));
                                141                 :                : 
                                142         [ +  + ]:        2632656 :         if (offnum == minoff)
                                143                 :                :         {
                                144                 :                :             /*
                                145                 :                :              * No previous/base tuple for the data item -- use the data item
                                146                 :                :              * as base tuple of pending posting list
                                147                 :                :              */
                                148                 :          11769 :             _bt_dedup_start_pending(state, itup, offnum);
                                149                 :                :         }
                                150   [ +  +  +  + ]:        5240564 :         else if (state->deduplicate &&
 1478                           151         [ +  + ]:        2983384 :                  _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
 1509                           152                 :         363707 :                  _bt_dedup_save_htid(state, itup))
                                153                 :                :         {
                                154                 :                :             /*
                                155                 :                :              * Tuple is equal to base tuple of pending posting list.  Heap
                                156                 :                :              * TID(s) for itup have been saved in state.
                                157                 :                :              */
                                158                 :                :         }
                                159                 :                :         else
                                160                 :                :         {
                                161                 :                :             /*
                                162                 :                :              * Tuple is not equal to pending posting list tuple, or
                                163                 :                :              * _bt_dedup_save_htid() opted to not merge current item into
                                164                 :                :              * pending posting list for some other reason (e.g., adding more
                                165                 :                :              * TIDs would have caused posting list to exceed current
                                166                 :                :              * maxpostingsize).
                                167                 :                :              *
                                168                 :                :              * If state contains pending posting list with more than one item,
                                169                 :                :              * form new posting tuple and add it to our temp page (newpage).
                                170                 :                :              * Else add pending interval's base tuple to the temp page as-is.
                                171                 :                :              */
                                172                 :        2262964 :             pagesaving += _bt_dedup_finish_pending(newpage, state);
                                173                 :                : 
                                174         [ +  + ]:        2262964 :             if (singlevalstrat)
                                175                 :                :             {
                                176                 :                :                 /*
                                177                 :                :                  * Single value strategy's extra steps.
                                178                 :                :                  *
                                179                 :                :                  * Lower maxpostingsize for sixth and final large posting list
                                180                 :                :                  * tuple at the point where 5 maxpostingsize-capped tuples
                                181                 :                :                  * have either been formed or observed.
                                182                 :                :                  *
                                183                 :                :                  * When a sixth maxpostingsize-capped item is formed/observed,
                                184                 :                :                  * stop merging together tuples altogether.  The few tuples
                                185                 :                :                  * that remain at the end of the page won't be merged together
                                186                 :                :                  * at all (at least not until after a future page split takes
                                187                 :                :                  * place, when this page's newly allocated right sibling page
                                188                 :                :                  * gets its first deduplication pass).
                                189                 :                :                  */
 1395                           190         [ +  + ]:           2612 :                 if (state->nmaxitems == 5)
 1509                           191                 :            322 :                     _bt_singleval_fillfactor(page, state, newitemsz);
 1395                           192         [ +  + ]:           2290 :                 else if (state->nmaxitems == 6)
                                193                 :                :                 {
 1509                           194                 :            124 :                     state->deduplicate = false;
                                195                 :            124 :                     singlevalstrat = false; /* won't be back here */
                                196                 :                :                 }
                                197                 :                :             }
                                198                 :                : 
                                199                 :                :             /* itup starts new pending posting list */
                                200                 :        2262964 :             _bt_dedup_start_pending(state, itup, offnum);
                                201                 :                :         }
                                202                 :                :     }
                                203                 :                : 
                                204                 :                :     /* Handle the last item */
                                205                 :          11769 :     pagesaving += _bt_dedup_finish_pending(newpage, state);
                                206                 :                : 
                                207                 :                :     /*
                                208                 :                :      * If no items suitable for deduplication were found, newpage must be
                                209                 :                :      * exactly the same as the original page, so just return from function.
                                210                 :                :      *
                                211                 :                :      * We could determine whether or not to proceed on the basis the space
                                212                 :                :      * savings being sufficient to avoid an immediate page split instead.  We
                                213                 :                :      * don't do that because there is some small value in nbtsplitloc.c always
                                214                 :                :      * operating against a page that is fully deduplicated (apart from
                                215                 :                :      * newitem).  Besides, most of the cost has already been paid.
                                216                 :                :      */
                                217         [ +  + ]:          11769 :     if (state->nintervals == 0)
                                218                 :                :     {
                                219                 :                :         /* cannot leak memory here */
                                220                 :           2001 :         pfree(newpage);
                                221                 :           2001 :         pfree(state->htids);
                                222                 :           2001 :         pfree(state);
                                223                 :           2001 :         return;
                                224                 :                :     }
                                225                 :                : 
                                226                 :                :     /*
                                227                 :                :      * By here, it's clear that deduplication will definitely go ahead.
                                228                 :                :      *
                                229                 :                :      * Clear the BTP_HAS_GARBAGE page flag.  The index must be a heapkeyspace
                                230                 :                :      * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
                                231                 :                :      * But keep things tidy.
                                232                 :                :      */
                                233         [ -  + ]:           9768 :     if (P_HAS_GARBAGE(opaque))
                                234                 :                :     {
  744 michael@paquier.xyz       235                 :UBC           0 :         BTPageOpaque nopaque = BTPageGetOpaque(newpage);
                                236                 :                : 
 1509 pg@bowt.ie                237                 :              0 :         nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
                                238                 :                :     }
                                239                 :                : 
 1509 pg@bowt.ie                240                 :CBC        9768 :     START_CRIT_SECTION();
                                241                 :                : 
                                242                 :           9768 :     PageRestoreTempPage(newpage, page);
                                243                 :           9768 :     MarkBufferDirty(buf);
                                244                 :                : 
                                245                 :                :     /* XLOG stuff */
                                246   [ +  +  +  +  :           9768 :     if (RelationNeedsWAL(rel))
                                        +  -  +  - ]
                                247                 :                :     {
                                248                 :                :         XLogRecPtr  recptr;
                                249                 :                :         xl_btree_dedup xlrec_dedup;
                                250                 :                : 
                                251                 :           9705 :         xlrec_dedup.nintervals = state->nintervals;
                                252                 :                : 
                                253                 :           9705 :         XLogBeginInsert();
                                254                 :           9705 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
                                255                 :           9705 :         XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
                                256                 :                : 
                                257                 :                :         /*
                                258                 :                :          * The intervals array is not in the buffer, but pretend that it is.
                                259                 :                :          * When XLogInsert stores the whole buffer, the array need not be
                                260                 :                :          * stored too.
                                261                 :                :          */
                                262                 :           9705 :         XLogRegisterBufData(0, (char *) state->intervals,
                                263                 :           9705 :                             state->nintervals * sizeof(BTDedupInterval));
                                264                 :                : 
                                265                 :           9705 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
                                266                 :                : 
                                267                 :           9705 :         PageSetLSN(page, recptr);
                                268                 :                :     }
                                269                 :                : 
                                270         [ -  + ]:           9768 :     END_CRIT_SECTION();
                                271                 :                : 
                                272                 :                :     /* Local space accounting should agree with page accounting */
                                273   [ +  +  -  + ]:           9768 :     Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
                                274                 :                : 
                                275                 :                :     /* cannot leak memory here */
                                276                 :           9768 :     pfree(state->htids);
                                277                 :           9768 :     pfree(state);
                                278                 :                : }
                                279                 :                : 
                                280                 :                : /*
                                281                 :                :  * Perform bottom-up index deletion pass.
                                282                 :                :  *
                                283                 :                :  * See if duplicate index tuples (plus certain nearby tuples) are eligible to
                                284                 :                :  * be deleted via bottom-up index deletion.  The high level goal here is to
                                285                 :                :  * entirely prevent "unnecessary" page splits caused by MVCC version churn
                                286                 :                :  * from UPDATEs (when the UPDATEs don't logically modify any of the columns
                                287                 :                :  * covered by the 'rel' index).  This is qualitative, not quantitative -- we
                                288                 :                :  * do not particularly care about once-off opportunities to delete many index
                                289                 :                :  * tuples together.
                                290                 :                :  *
                                291                 :                :  * See nbtree/README for details on the design of nbtree bottom-up deletion.
                                292                 :                :  * See access/tableam.h for a description of how we're expected to cooperate
                                293                 :                :  * with the tableam.
                                294                 :                :  *
                                295                 :                :  * Returns true on success, in which case caller can assume page split will be
                                296                 :                :  * avoided for a reasonable amount of time.  Returns false when caller should
                                297                 :                :  * deduplicate the page (if possible at all).
                                298                 :                :  *
                                299                 :                :  * Note: Occasionally we return true despite failing to delete enough items to
                                300                 :                :  * avoid a split.  This makes caller skip deduplication and go split the page
                                301                 :                :  * right away.  Our return value is always just advisory information.
                                302                 :                :  *
                                303                 :                :  * Note: Caller should have already deleted all existing items with their
                                304                 :                :  * LP_DEAD bits set.
                                305                 :                :  */
                                306                 :                : bool
 1187                           307                 :           1729 : _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
                                308                 :                :                      Size newitemsz)
                                309                 :                : {
                                310                 :                :     OffsetNumber offnum,
                                311                 :                :                 minoff,
                                312                 :                :                 maxoff;
                                313                 :           1729 :     Page        page = BufferGetPage(buf);
  744 michael@paquier.xyz       314                 :           1729 :     BTPageOpaque opaque = BTPageGetOpaque(page);
                                315                 :                :     BTDedupState state;
                                316                 :                :     TM_IndexDeleteOp delstate;
                                317                 :                :     bool        neverdedup;
 1187 pg@bowt.ie                318                 :           1729 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
                                319                 :                : 
                                320                 :                :     /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
                                321                 :           1729 :     newitemsz += sizeof(ItemIdData);
                                322                 :                : 
                                323                 :                :     /* Initialize deduplication state */
                                324                 :           1729 :     state = (BTDedupState) palloc(sizeof(BTDedupStateData));
                                325                 :           1729 :     state->deduplicate = true;
                                326                 :           1729 :     state->nmaxitems = 0;
                                327                 :           1729 :     state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
                                328                 :           1729 :     state->base = NULL;
                                329                 :           1729 :     state->baseoff = InvalidOffsetNumber;
                                330                 :           1729 :     state->basetupsize = 0;
                                331                 :           1729 :     state->htids = palloc(state->maxpostingsize);
                                332                 :           1729 :     state->nhtids = 0;
                                333                 :           1729 :     state->nitems = 0;
                                334                 :           1729 :     state->phystupsize = 0;
                                335                 :           1729 :     state->nintervals = 0;
                                336                 :                : 
                                337                 :                :     /*
                                338                 :                :      * Initialize tableam state that describes bottom-up index deletion
                                339                 :                :      * operation.
                                340                 :                :      *
                                341                 :                :      * We'll go on to ask the tableam to search for TIDs whose index tuples we
                                342                 :                :      * can safely delete.  The tableam will search until our leaf page space
                                343                 :                :      * target is satisfied, or until the cost of continuing with the tableam
                                344                 :                :      * operation seems too high.  It focuses its efforts on TIDs associated
                                345                 :                :      * with duplicate index tuples that we mark "promising".
                                346                 :                :      *
                                347                 :                :      * This space target is a little arbitrary.  The tableam must be able to
                                348                 :                :      * keep the costs and benefits in balance.  We provide the tableam with
                                349                 :                :      * exhaustive information about what might work, without directly
                                350                 :                :      * concerning ourselves with avoiding work during the tableam call.  Our
                                351                 :                :      * role in costing the bottom-up deletion process is strictly advisory.
                                352                 :                :      */
  892                           353                 :           1729 :     delstate.irel = rel;
                                354                 :           1729 :     delstate.iblknum = BufferGetBlockNumber(buf);
 1187                           355                 :           1729 :     delstate.bottomup = true;
                                356                 :           1729 :     delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
                                357                 :           1729 :     delstate.ndeltids = 0;
                                358                 :           1729 :     delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
                                359                 :           1729 :     delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
                                360                 :                : 
                                361         [ +  + ]:           1729 :     minoff = P_FIRSTDATAKEY(opaque);
                                362                 :           1729 :     maxoff = PageGetMaxOffsetNumber(page);
                                363                 :           1729 :     for (offnum = minoff;
                                364         [ +  + ]:         470676 :          offnum <= maxoff;
                                365                 :         468947 :          offnum = OffsetNumberNext(offnum))
                                366                 :                :     {
                                367                 :         468947 :         ItemId      itemid = PageGetItemId(page, offnum);
                                368                 :         468947 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
                                369                 :                : 
                                370         [ -  + ]:         468947 :         Assert(!ItemIdIsDead(itemid));
                                371                 :                : 
                                372         [ +  + ]:         468947 :         if (offnum == minoff)
                                373                 :                :         {
                                374                 :                :             /* itup starts first pending interval */
                                375                 :           1729 :             _bt_dedup_start_pending(state, itup, offnum);
                                376                 :                :         }
                                377   [ +  +  +  - ]:         532900 :         else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
                                378                 :          65682 :                  _bt_dedup_save_htid(state, itup))
                                379                 :                :         {
                                380                 :                :             /* Tuple is equal; just added its TIDs to pending interval */
                                381                 :                :         }
                                382                 :                :         else
                                383                 :                :         {
                                384                 :                :             /* Finalize interval -- move its TIDs to delete state */
                                385                 :         401536 :             _bt_bottomupdel_finish_pending(page, state, &delstate);
                                386                 :                : 
                                387                 :                :             /* itup starts new pending interval */
                                388                 :         401536 :             _bt_dedup_start_pending(state, itup, offnum);
                                389                 :                :         }
                                390                 :                :     }
                                391                 :                :     /* Finalize final interval -- move its TIDs to delete state */
                                392                 :           1729 :     _bt_bottomupdel_finish_pending(page, state, &delstate);
                                393                 :                : 
                                394                 :                :     /*
                                395                 :                :      * We don't give up now in the event of having few (or even zero)
                                396                 :                :      * promising tuples for the tableam because it's not up to us as the index
                                397                 :                :      * AM to manage costs (note that the tableam might have heuristics of its
                                398                 :                :      * own that work out what to do).  We should at least avoid having our
                                399                 :                :      * caller do a useless deduplication pass after we return in the event of
                                400                 :                :      * zero promising tuples, though.
                                401                 :                :      */
                                402                 :           1729 :     neverdedup = false;
                                403         [ +  + ]:           1729 :     if (state->nintervals == 0)
                                404                 :              4 :         neverdedup = true;
                                405                 :                : 
                                406                 :           1729 :     pfree(state->htids);
                                407                 :           1729 :     pfree(state);
                                408                 :                : 
                                409                 :                :     /* Ask tableam which TIDs are deletable, then physically delete them */
                                410                 :           1729 :     _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
                                411                 :                : 
                                412                 :           1729 :     pfree(delstate.deltids);
                                413                 :           1729 :     pfree(delstate.status);
                                414                 :                : 
                                415                 :                :     /* Report "success" to caller unconditionally to avoid deduplication */
                                416         [ +  + ]:           1729 :     if (neverdedup)
                                417                 :              4 :         return true;
                                418                 :                : 
                                419                 :                :     /* Don't dedup when we won't end up back here any time soon anyway */
                                420                 :           1725 :     return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
                                421                 :                : }
                                422                 :                : 
                                423                 :                : /*
                                424                 :                :  * Create a new pending posting list tuple based on caller's base tuple.
                                425                 :                :  *
                                426                 :                :  * Every tuple processed by deduplication either becomes the base tuple for a
                                427                 :                :  * posting list, or gets its heap TID(s) accepted into a pending posting list.
                                428                 :                :  * A tuple that starts out as the base tuple for a posting list will only
                                429                 :                :  * actually be rewritten within _bt_dedup_finish_pending() when it turns out
                                430                 :                :  * that there are duplicates that can be merged into the base tuple.
                                431                 :                :  */
                                432                 :                : void
 1509                           433                 :        5421023 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
                                434                 :                :                         OffsetNumber baseoff)
                                435                 :                : {
                                436         [ -  + ]:        5421023 :     Assert(state->nhtids == 0);
                                437         [ -  + ]:        5421023 :     Assert(state->nitems == 0);
                                438         [ -  + ]:        5421023 :     Assert(!BTreeTupleIsPivot(base));
                                439                 :                : 
                                440                 :                :     /*
                                441                 :                :      * Copy heap TID(s) from new base tuple for new candidate posting list
                                442                 :                :      * into working state's array
                                443                 :                :      */
                                444         [ +  + ]:        5421023 :     if (!BTreeTupleIsPosting(base))
                                445                 :                :     {
                                446                 :        4517554 :         memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
                                447                 :        4517554 :         state->nhtids = 1;
                                448                 :        4517554 :         state->basetupsize = IndexTupleSize(base);
                                449                 :                :     }
                                450                 :                :     else
                                451                 :                :     {
                                452                 :                :         int         nposting;
                                453                 :                : 
                                454                 :         903469 :         nposting = BTreeTupleGetNPosting(base);
                                455                 :         903469 :         memcpy(state->htids, BTreeTupleGetPosting(base),
                                456                 :                :                sizeof(ItemPointerData) * nposting);
                                457                 :         903469 :         state->nhtids = nposting;
                                458                 :                :         /* basetupsize should not include existing posting list */
                                459                 :         903469 :         state->basetupsize = BTreeTupleGetPostingOffset(base);
                                460                 :                :     }
                                461                 :                : 
                                462                 :                :     /*
                                463                 :                :      * Save new base tuple itself -- it'll be needed if we actually create a
                                464                 :                :      * new posting list from new pending posting list.
                                465                 :                :      *
                                466                 :                :      * Must maintain physical size of all existing tuples (including line
                                467                 :                :      * pointer overhead) so that we can calculate space savings on page.
                                468                 :                :      */
                                469                 :        5421023 :     state->nitems = 1;
                                470                 :        5421023 :     state->base = base;
                                471                 :        5421023 :     state->baseoff = baseoff;
                                472                 :        5421023 :     state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
                                473                 :                :     /* Also save baseoff in pending state for interval */
                                474                 :        5421023 :     state->intervals[state->nintervals].baseoff = state->baseoff;
                                475                 :        5421023 : }
                                476                 :                : 
                                477                 :                : /*
                                478                 :                :  * Save itup heap TID(s) into pending posting list where possible.
                                479                 :                :  *
                                480                 :                :  * Returns bool indicating if the pending posting list managed by state now
                                481                 :                :  * includes itup's heap TID(s).
                                482                 :                :  */
                                483                 :                : bool
                                484                 :        1168393 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
                                485                 :                : {
                                486                 :                :     int         nhtids;
                                487                 :                :     ItemPointer htids;
                                488                 :                :     Size        mergedtupsz;
                                489                 :                : 
                                490         [ -  + ]:        1168393 :     Assert(!BTreeTupleIsPivot(itup));
                                491                 :                : 
                                492         [ +  + ]:        1168393 :     if (!BTreeTupleIsPosting(itup))
                                493                 :                :     {
                                494                 :        1162603 :         nhtids = 1;
                                495                 :        1162603 :         htids = &itup->t_tid;
                                496                 :                :     }
                                497                 :                :     else
                                498                 :                :     {
                                499                 :           5790 :         nhtids = BTreeTupleGetNPosting(itup);
                                500                 :           5790 :         htids = BTreeTupleGetPosting(itup);
                                501                 :                :     }
                                502                 :                : 
                                503                 :                :     /*
                                504                 :                :      * Don't append (have caller finish pending posting list as-is) if
                                505                 :                :      * appending heap TID(s) from itup would put us over maxpostingsize limit.
                                506                 :                :      *
                                507                 :                :      * This calculation needs to match the code used within _bt_form_posting()
                                508                 :                :      * for new posting list tuples.
                                509                 :                :      */
                                510                 :        1168393 :     mergedtupsz = MAXALIGN(state->basetupsize +
                                511                 :                :                            (state->nhtids + nhtids) * sizeof(ItemPointerData));
                                512                 :                : 
                                513         [ +  + ]:        1168393 :     if (mergedtupsz > state->maxpostingsize)
                                514                 :                :     {
                                515                 :                :         /*
                                516                 :                :          * Count this as an oversized item for single value strategy, though
                                517                 :                :          * only when there are 50 TIDs in the final posting list tuple.  This
                                518                 :                :          * limit (which is fairly arbitrary) avoids confusion about how many
                                519                 :                :          * 1/6 of a page tuples have been encountered/created by the current
                                520                 :                :          * deduplication pass.
                                521                 :                :          *
                                522                 :                :          * Note: We deliberately don't consider which deduplication pass
                                523                 :                :          * merged together tuples to create this item (could be a previous
                                524                 :                :          * deduplication pass, or current pass).  See _bt_do_singleval()
                                525                 :                :          * comments.
                                526                 :                :          */
 1395                           527         [ +  + ]:           9637 :         if (state->nhtids > 50)
                                528                 :           9139 :             state->nmaxitems++;
                                529                 :                : 
 1509                           530                 :           9637 :         return false;
                                531                 :                :     }
                                532                 :                : 
                                533                 :                :     /*
                                534                 :                :      * Save heap TIDs to pending posting list tuple -- itup can be merged into
                                535                 :                :      * pending posting list
                                536                 :                :      */
                                537                 :        1158756 :     state->nitems++;
                                538                 :        1158756 :     memcpy(state->htids + state->nhtids, htids,
                                539                 :                :            sizeof(ItemPointerData) * nhtids);
                                540                 :        1158756 :     state->nhtids += nhtids;
                                541                 :        1158756 :     state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
                                542                 :                : 
                                543                 :        1158756 :     return true;
                                544                 :                : }
                                545                 :                : 
                                546                 :                : /*
                                547                 :                :  * Finalize pending posting list tuple, and add it to the page.  Final tuple
                                548                 :                :  * is based on saved base tuple, and saved list of heap TIDs.
                                549                 :                :  *
                                550                 :                :  * Returns space saving from deduplicating to make a new posting list tuple.
                                551                 :                :  * Note that this includes line pointer overhead.  This is zero in the case
                                552                 :                :  * where no deduplication was possible.
                                553                 :                :  */
                                554                 :                : Size
                                555                 :        2678849 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
                                556                 :                : {
                                557                 :                :     OffsetNumber tupoff;
                                558                 :                :     Size        tuplesz;
                                559                 :                :     Size        spacesaving;
                                560                 :                : 
                                561         [ -  + ]:        2678849 :     Assert(state->nitems > 0);
                                562         [ -  + ]:        2678849 :     Assert(state->nitems <= state->nhtids);
                                563         [ -  + ]:        2678849 :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
                                564                 :                : 
                                565                 :        2678849 :     tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
                                566         [ +  + ]:        2678849 :     if (state->nitems == 1)
                                567                 :                :     {
                                568                 :                :         /* Use original, unchanged base tuple */
                                569                 :        2497688 :         tuplesz = IndexTupleSize(state->base);
  618                           570         [ -  + ]:        2497688 :         Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
                                571         [ -  + ]:        2497688 :         Assert(tuplesz <= BTMaxItemSize(newpage));
 1509                           572         [ -  + ]:        2497688 :         if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
                                573                 :                :                         false, false) == InvalidOffsetNumber)
 1509 pg@bowt.ie                574         [ #  # ]:UBC           0 :             elog(ERROR, "deduplication failed to add tuple to page");
                                575                 :                : 
 1509 pg@bowt.ie                576                 :CBC     2497688 :         spacesaving = 0;
                                577                 :                :     }
                                578                 :                :     else
                                579                 :                :     {
                                580                 :                :         IndexTuple  final;
                                581                 :                : 
                                582                 :                :         /* Form a tuple with a posting list */
                                583                 :         181161 :         final = _bt_form_posting(state->base, state->htids, state->nhtids);
                                584                 :         181161 :         tuplesz = IndexTupleSize(final);
                                585         [ -  + ]:         181161 :         Assert(tuplesz <= state->maxpostingsize);
                                586                 :                : 
                                587                 :                :         /* Save final number of items for posting list */
                                588                 :         181161 :         state->intervals[state->nintervals].nitems = state->nitems;
                                589                 :                : 
                                590         [ -  + ]:         181161 :         Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
  618                           591         [ -  + ]:         181161 :         Assert(tuplesz <= BTMaxItemSize(newpage));
 1509                           592         [ -  + ]:         181161 :         if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
                                593                 :                :                         false) == InvalidOffsetNumber)
 1509 pg@bowt.ie                594         [ #  # ]:UBC           0 :             elog(ERROR, "deduplication failed to add tuple to page");
                                595                 :                : 
 1509 pg@bowt.ie                596                 :CBC      181161 :         pfree(final);
                                597                 :         181161 :         spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
                                598                 :                :         /* Increment nintervals, since we wrote a new posting list tuple */
                                599                 :         181161 :         state->nintervals++;
                                600   [ +  -  -  + ]:         181161 :         Assert(spacesaving > 0 && spacesaving < BLCKSZ);
                                601                 :                :     }
                                602                 :                : 
                                603                 :                :     /* Reset state for next pending posting list */
                                604                 :        2678849 :     state->nhtids = 0;
                                605                 :        2678849 :     state->nitems = 0;
                                606                 :        2678849 :     state->phystupsize = 0;
                                607                 :                : 
                                608                 :        2678849 :     return spacesaving;
                                609                 :                : }
                                610                 :                : 
                                611                 :                : /*
                                612                 :                :  * Finalize interval during bottom-up index deletion.
                                613                 :                :  *
                                614                 :                :  * During a bottom-up pass we expect that TIDs will be recorded in dedup state
                                615                 :                :  * first, and then get moved over to delstate (in variable-sized batches) by
                                616                 :                :  * calling here.  Call here happens when the number of TIDs in a dedup
                                617                 :                :  * interval is known, and interval gets finalized (i.e. when caller sees next
                                618                 :                :  * tuple on the page is not a duplicate, or when caller runs out of tuples to
                                619                 :                :  * process from leaf page).
                                620                 :                :  *
                                621                 :                :  * This is where bottom-up deletion determines and remembers which entries are
                                622                 :                :  * duplicates.  This will be important information to the tableam delete
                                623                 :                :  * infrastructure later on.  Plain index tuple duplicates are marked
                                624                 :                :  * "promising" here, per tableam contract.
                                625                 :                :  *
                                626                 :                :  * Our approach to marking entries whose TIDs come from posting lists is more
                                627                 :                :  * complicated.  Posting lists can only be formed by a deduplication pass (or
                                628                 :                :  * during an index build), so recent version churn affecting the pointed-to
                                629                 :                :  * logical rows is not particularly likely.  We may still give a weak signal
                                630                 :                :  * about posting list tuples' entries (by marking just one of its TIDs/entries
                                631                 :                :  * promising), though this is only a possibility in the event of further
                                632                 :                :  * duplicate index tuples in final interval that covers posting list tuple (as
                                633                 :                :  * in the plain tuple case).  A weak signal/hint will be useful to the tableam
                                634                 :                :  * when it has no stronger signal to go with for the deletion operation as a
                                635                 :                :  * whole.
                                636                 :                :  *
                                637                 :                :  * The heuristics we use work well in practice because we only need to give
                                638                 :                :  * the tableam the right _general_ idea about where to look.  Garbage tends to
                                639                 :                :  * naturally get concentrated in relatively few table blocks with workloads
                                640                 :                :  * that bottom-up deletion targets.  The tableam cannot possibly rank all
                                641                 :                :  * available table blocks sensibly based on the hints we provide, but that's
                                642                 :                :  * okay -- only the extremes matter.  The tableam just needs to be able to
                                643                 :                :  * predict which few table blocks will have the most tuples that are safe to
                                644                 :                :  * delete for each deletion operation, with low variance across related
                                645                 :                :  * deletion operations.
                                646                 :                :  */
                                647                 :                : static void
 1187                           648                 :         403265 : _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
                                649                 :                :                                TM_IndexDeleteOp *delstate)
                                650                 :                : {
                                651                 :         403265 :     bool        dupinterval = (state->nitems > 1);
                                652                 :                : 
                                653         [ -  + ]:         403265 :     Assert(state->nitems > 0);
                                654         [ -  + ]:         403265 :     Assert(state->nitems <= state->nhtids);
                                655         [ -  + ]:         403265 :     Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
                                656                 :                : 
                                657         [ +  + ]:         872212 :     for (int i = 0; i < state->nitems; i++)
                                658                 :                :     {
                                659                 :         468947 :         OffsetNumber offnum = state->baseoff + i;
                                660                 :         468947 :         ItemId      itemid = PageGetItemId(page, offnum);
                                661                 :         468947 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
                                662                 :         468947 :         TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
                                663                 :         468947 :         TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
                                664                 :                : 
                                665         [ +  + ]:         468947 :         if (!BTreeTupleIsPosting(itup))
                                666                 :                :         {
                                667                 :                :             /* Simple case: A plain non-pivot tuple */
                                668                 :         364277 :             ideltid->tid = itup->t_tid;
                                669                 :         364277 :             ideltid->id = delstate->ndeltids;
                                670                 :         364277 :             istatus->idxoffnum = offnum;
                                671                 :         364277 :             istatus->knowndeletable = false; /* for now */
                                672                 :         364277 :             istatus->promising = dupinterval;    /* simple rule */
                                673                 :         364277 :             istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
                                674                 :                : 
                                675                 :         364277 :             delstate->ndeltids++;
                                676                 :                :         }
                                677                 :                :         else
                                678                 :                :         {
                                679                 :                :             /*
                                680                 :                :              * Complicated case: A posting list tuple.
                                681                 :                :              *
                                682                 :                :              * We make the conservative assumption that there can only be at
                                683                 :                :              * most one affected logical row per posting list tuple.  There
                                684                 :                :              * will be at most one promising entry in deltids to represent
                                685                 :                :              * this presumed lone logical row.  Note that this isn't even
                                686                 :                :              * considered unless the posting list tuple is also in an interval
                                687                 :                :              * of duplicates -- this complicated rule is just a variant of the
                                688                 :                :              * simple rule used to decide if plain index tuples are promising.
                                689                 :                :              */
                                690                 :         104670 :             int         nitem = BTreeTupleGetNPosting(itup);
                                691                 :         104670 :             bool        firstpromising = false;
                                692                 :         104670 :             bool        lastpromising = false;
                                693                 :                : 
                                694         [ -  + ]:         104670 :             Assert(_bt_posting_valid(itup));
                                695                 :                : 
                                696         [ +  + ]:         104670 :             if (dupinterval)
                                697                 :                :             {
                                698                 :                :                 /*
                                699                 :                :                  * Complicated rule: either the first or last TID in the
                                700                 :                :                  * posting list gets marked promising (if any at all)
                                701                 :                :                  */
                                702                 :                :                 BlockNumber minblocklist,
                                703                 :                :                             midblocklist,
                                704                 :                :                             maxblocklist;
                                705                 :                :                 ItemPointer mintid,
                                706                 :                :                             midtid,
                                707                 :                :                             maxtid;
                                708                 :                : 
                                709                 :           9539 :                 mintid = BTreeTupleGetHeapTID(itup);
                                710                 :           9539 :                 midtid = BTreeTupleGetPostingN(itup, nitem / 2);
                                711                 :           9539 :                 maxtid = BTreeTupleGetMaxHeapTID(itup);
                                712                 :           9539 :                 minblocklist = ItemPointerGetBlockNumber(mintid);
                                713                 :           9539 :                 midblocklist = ItemPointerGetBlockNumber(midtid);
                                714                 :           9539 :                 maxblocklist = ItemPointerGetBlockNumber(maxtid);
                                715                 :                : 
                                716                 :                :                 /* Only entry with predominant table block can be promising */
                                717                 :           9539 :                 firstpromising = (minblocklist == midblocklist);
                                718   [ +  +  +  + ]:           9539 :                 lastpromising = (!firstpromising &&
                                719                 :                :                                  midblocklist == maxblocklist);
                                720                 :                :             }
                                721                 :                : 
                                722         [ +  + ]:         607708 :             for (int p = 0; p < nitem; p++)
                                723                 :                :             {
                                724                 :         503038 :                 ItemPointer htid = BTreeTupleGetPostingN(itup, p);
                                725                 :                : 
                                726                 :         503038 :                 ideltid->tid = *htid;
                                727                 :         503038 :                 ideltid->id = delstate->ndeltids;
                                728                 :         503038 :                 istatus->idxoffnum = offnum;
                                729                 :         503038 :                 istatus->knowndeletable = false; /* for now */
                                730                 :         503038 :                 istatus->promising = false;
                                731   [ +  +  +  +  :         503038 :                 if ((firstpromising && p == 0) ||
                                              +  + ]
                                732         [ +  + ]:          63094 :                     (lastpromising && p == nitem - 1))
                                733                 :           6178 :                     istatus->promising = true;
                                734                 :         503038 :                 istatus->freespace = sizeof(ItemPointerData);    /* at worst */
                                735                 :                : 
                                736                 :         503038 :                 ideltid++;
                                737                 :         503038 :                 istatus++;
                                738                 :         503038 :                 delstate->ndeltids++;
                                739                 :                :             }
                                740                 :                :         }
                                741                 :                :     }
                                742                 :                : 
                                743         [ +  + ]:         403265 :     if (dupinterval)
                                744                 :                :     {
                                745                 :          43741 :         state->intervals[state->nintervals].nitems = state->nitems;
                                746                 :          43741 :         state->nintervals++;
                                747                 :                :     }
                                748                 :                : 
                                749                 :                :     /* Reset state for next interval */
                                750                 :         403265 :     state->nhtids = 0;
                                751                 :         403265 :     state->nitems = 0;
                                752                 :         403265 :     state->phystupsize = 0;
                                753                 :         403265 : }
                                754                 :                : 
                                755                 :                : /*
                                756                 :                :  * Determine if page non-pivot tuples (data items) are all duplicates of the
                                757                 :                :  * same value -- if they are, deduplication's "single value" strategy should
                                758                 :                :  * be applied.  The general goal of this strategy is to ensure that
                                759                 :                :  * nbtsplitloc.c (which uses its own single value strategy) will find a useful
                                760                 :                :  * split point as further duplicates are inserted, and successive rightmost
                                761                 :                :  * page splits occur among pages that store the same duplicate value.  When
                                762                 :                :  * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
                                763                 :                :  * just like it would if deduplication were disabled.
                                764                 :                :  *
                                765                 :                :  * We expect that affected workloads will require _several_ single value
                                766                 :                :  * strategy deduplication passes (over a page that only stores duplicates)
                                767                 :                :  * before the page is finally split.  The first deduplication pass should only
                                768                 :                :  * find regular non-pivot tuples.  Later deduplication passes will find
                                769                 :                :  * existing maxpostingsize-capped posting list tuples, which must be skipped
                                770                 :                :  * over.  The penultimate pass is generally the first pass that actually
                                771                 :                :  * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
                                772                 :                :  * few untouched non-pivot tuples.  The final deduplication pass won't free
                                773                 :                :  * any space -- it will skip over everything without merging anything (it
                                774                 :                :  * retraces the steps of the penultimate pass).
                                775                 :                :  *
                                776                 :                :  * Fortunately, having several passes isn't too expensive.  Each pass (after
                                777                 :                :  * the first pass) won't spend many cycles on the large posting list tuples
                                778                 :                :  * left by previous passes.  Each pass will find a large contiguous group of
                                779                 :                :  * smaller duplicate tuples to merge together at the end of the page.
                                780                 :                :  */
                                781                 :                : static bool
 1509                           782                 :          10208 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
                                783                 :                :                  OffsetNumber minoff, IndexTuple newitem)
                                784                 :                : {
 1478                           785                 :          10208 :     int         nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
                                786                 :                :     ItemId      itemid;
                                787                 :                :     IndexTuple  itup;
                                788                 :                : 
 1509                           789                 :          10208 :     itemid = PageGetItemId(page, minoff);
                                790                 :          10208 :     itup = (IndexTuple) PageGetItem(page, itemid);
                                791                 :                : 
 1478                           792         [ +  + ]:          10208 :     if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
                                793                 :                :     {
 1509                           794                 :            994 :         itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
                                795                 :            994 :         itup = (IndexTuple) PageGetItem(page, itemid);
                                796                 :                : 
 1478                           797         [ +  + ]:            994 :         if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
 1509                           798                 :            645 :             return true;
                                799                 :                :     }
                                800                 :                : 
                                801                 :           9563 :     return false;
                                802                 :                : }
                                803                 :                : 
                                804                 :                : /*
                                805                 :                :  * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
                                806                 :                :  * and final maxpostingsize-capped tuple.  The sixth and final posting list
                                807                 :                :  * tuple will end up somewhat smaller than the first five.  (Note: The first
                                808                 :                :  * five tuples could actually just be very large duplicate tuples that
                                809                 :                :  * couldn't be merged together at all.  Deduplication will simply not modify
                                810                 :                :  * the page when that happens.)
                                811                 :                :  *
                                812                 :                :  * When there are six posting lists on the page (after current deduplication
                                813                 :                :  * pass goes on to create/observe a sixth very large tuple), caller should end
                                814                 :                :  * its deduplication pass.  It isn't useful to try to deduplicate items that
                                815                 :                :  * are supposed to end up on the new right sibling page following the
                                816                 :                :  * anticipated page split.  A future deduplication pass of future right
                                817                 :                :  * sibling page might take care of it.  (This is why the first single value
                                818                 :                :  * strategy deduplication pass for a given leaf page will generally find only
                                819                 :                :  * plain non-pivot tuples -- see _bt_do_singleval() comments.)
                                820                 :                :  */
                                821                 :                : static void
                                822                 :            322 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
                                823                 :                : {
                                824                 :                :     Size        leftfree;
                                825                 :                :     int         reduction;
                                826                 :                : 
                                827                 :                :     /* This calculation needs to match nbtsplitloc.c */
                                828                 :            322 :     leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
                                829                 :                :         MAXALIGN(sizeof(BTPageOpaqueData));
                                830                 :                :     /* Subtract size of new high key (includes pivot heap TID space) */
                                831                 :            322 :     leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
                                832                 :                : 
                                833                 :                :     /*
                                834                 :                :      * Reduce maxpostingsize by an amount equal to target free space on left
                                835                 :                :      * half of page
                                836                 :                :      */
                                837                 :            322 :     reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
                                838         [ +  - ]:            322 :     if (state->maxpostingsize > reduction)
                                839                 :            322 :         state->maxpostingsize -= reduction;
                                840                 :                :     else
 1509 pg@bowt.ie                841                 :UBC           0 :         state->maxpostingsize = 0;
 1509 pg@bowt.ie                842                 :CBC         322 : }
                                843                 :                : 
                                844                 :                : /*
                                845                 :                :  * Build a posting list tuple based on caller's "base" index tuple and list of
                                846                 :                :  * heap TIDs.  When nhtids == 1, builds a standard non-pivot tuple without a
                                847                 :                :  * posting list. (Posting list tuples can never have a single heap TID, partly
                                848                 :                :  * because that ensures that deduplication always reduces final MAXALIGN()'d
                                849                 :                :  * size of entire tuple.)
                                850                 :                :  *
                                851                 :                :  * Convention is that posting list starts at a MAXALIGN()'d offset (rather
                                852                 :                :  * than a SHORTALIGN()'d offset), in line with the approach taken when
                                853                 :                :  * appending a heap TID to new pivot tuple/high key during suffix truncation.
                                854                 :                :  * This sometimes wastes a little space that was only needed as alignment
                                855                 :                :  * padding in the original tuple.  Following this convention simplifies the
                                856                 :                :  * space accounting used when deduplicating a page (the same convention
                                857                 :                :  * simplifies the accounting for choosing a point to split a page at).
                                858                 :                :  *
                                859                 :                :  * Note: Caller's "htids" array must be unique and already in ascending TID
                                860                 :                :  * order.  Any existing heap TIDs from "base" won't automatically appear in
                                861                 :                :  * returned posting list tuple (they must be included in htids array.)
                                862                 :                :  */
                                863                 :                : IndexTuple
                                864                 :         230084 : _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
                                865                 :                : {
                                866                 :                :     uint32      keysize,
                                867                 :                :                 newsize;
                                868                 :                :     IndexTuple  itup;
                                869                 :                : 
                                870         [ +  + ]:         230084 :     if (BTreeTupleIsPosting(base))
                                871                 :          70602 :         keysize = BTreeTupleGetPostingOffset(base);
                                872                 :                :     else
                                873                 :         159482 :         keysize = IndexTupleSize(base);
                                874                 :                : 
                                875         [ -  + ]:         230084 :     Assert(!BTreeTupleIsPivot(base));
                                876   [ +  -  -  + ]:         230084 :     Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
                                877         [ -  + ]:         230084 :     Assert(keysize == MAXALIGN(keysize));
                                878                 :                : 
                                879                 :                :     /* Determine final size of new tuple */
                                880         [ +  + ]:         230084 :     if (nhtids > 1)
                                881                 :         199538 :         newsize = MAXALIGN(keysize +
                                882                 :                :                            nhtids * sizeof(ItemPointerData));
                                883                 :                :     else
                                884                 :          30546 :         newsize = keysize;
                                885                 :                : 
                                886         [ -  + ]:         230084 :     Assert(newsize <= INDEX_SIZE_MASK);
                                887         [ -  + ]:         230084 :     Assert(newsize == MAXALIGN(newsize));
                                888                 :                : 
                                889                 :                :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
                                890                 :         230084 :     itup = palloc0(newsize);
                                891                 :         230084 :     memcpy(itup, base, keysize);
                                892                 :         230084 :     itup->t_info &= ~INDEX_SIZE_MASK;
                                893                 :         230084 :     itup->t_info |= newsize;
                                894         [ +  + ]:         230084 :     if (nhtids > 1)
                                895                 :                :     {
                                896                 :                :         /* Form posting list tuple */
                                897                 :         199538 :         BTreeTupleSetPosting(itup, nhtids, keysize);
                                898                 :         199538 :         memcpy(BTreeTupleGetPosting(itup), htids,
                                899                 :                :                sizeof(ItemPointerData) * nhtids);
                                900         [ -  + ]:         199538 :         Assert(_bt_posting_valid(itup));
                                901                 :                :     }
                                902                 :                :     else
                                903                 :                :     {
                                904                 :                :         /* Form standard non-pivot tuple */
                                905                 :          30546 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
                                906                 :          30546 :         ItemPointerCopy(htids, &itup->t_tid);
                                907         [ -  + ]:          30546 :         Assert(ItemPointerIsValid(&itup->t_tid));
                                908                 :                :     }
                                909                 :                : 
                                910                 :         230084 :     return itup;
                                911                 :                : }
                                912                 :                : 
                                913                 :                : /*
                                914                 :                :  * Generate a replacement tuple by "updating" a posting list tuple so that it
                                915                 :                :  * no longer has TIDs that need to be deleted.
                                916                 :                :  *
                                917                 :                :  * Used by both VACUUM and index deletion.  Caller's vacposting argument
                                918                 :                :  * points to the existing posting list tuple to be updated.
                                919                 :                :  *
                                920                 :                :  * On return, caller's vacposting argument will point to final "updated"
                                921                 :                :  * tuple, which will be palloc()'d in caller's memory context.
                                922                 :                :  */
                                923                 :                : void
                                924                 :          27813 : _bt_update_posting(BTVacuumPosting vacposting)
                                925                 :                : {
                                926                 :          27813 :     IndexTuple  origtuple = vacposting->itup;
                                927                 :                :     uint32      keysize,
                                928                 :                :                 newsize;
                                929                 :                :     IndexTuple  itup;
                                930                 :                :     int         nhtids;
                                931                 :                :     int         ui,
                                932                 :                :                 d;
                                933                 :                :     ItemPointer htids;
                                934                 :                : 
                                935                 :          27813 :     nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
                                936                 :                : 
                                937         [ -  + ]:          27813 :     Assert(_bt_posting_valid(origtuple));
                                938   [ +  -  -  + ]:          27813 :     Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
                                939                 :                : 
                                940                 :                :     /*
                                941                 :                :      * Determine final size of new tuple.
                                942                 :                :      *
                                943                 :                :      * This calculation needs to match the code used within _bt_form_posting()
                                944                 :                :      * for new posting list tuples.  We avoid calling _bt_form_posting() here
                                945                 :                :      * to save ourselves a second memory allocation for a htids workspace.
                                946                 :                :      */
 1505                           947                 :          27813 :     keysize = BTreeTupleGetPostingOffset(origtuple);
 1509                           948         [ +  + ]:          27813 :     if (nhtids > 1)
                                949                 :           6002 :         newsize = MAXALIGN(keysize +
                                950                 :                :                            nhtids * sizeof(ItemPointerData));
                                951                 :                :     else
                                952                 :          21811 :         newsize = keysize;
                                953                 :                : 
 1504                           954         [ -  + ]:          27813 :     Assert(newsize <= INDEX_SIZE_MASK);
                                955         [ -  + ]:          27813 :     Assert(newsize == MAXALIGN(newsize));
                                956                 :                : 
                                957                 :                :     /* Allocate memory using palloc0() (matches index_form_tuple()) */
 1509                           958                 :          27813 :     itup = palloc0(newsize);
                                959                 :          27813 :     memcpy(itup, origtuple, keysize);
                                960                 :          27813 :     itup->t_info &= ~INDEX_SIZE_MASK;
                                961                 :          27813 :     itup->t_info |= newsize;
                                962                 :                : 
                                963         [ +  + ]:          27813 :     if (nhtids > 1)
                                964                 :                :     {
                                965                 :                :         /* Form posting list tuple */
                                966                 :           6002 :         BTreeTupleSetPosting(itup, nhtids, keysize);
                                967                 :           6002 :         htids = BTreeTupleGetPosting(itup);
                                968                 :                :     }
                                969                 :                :     else
                                970                 :                :     {
                                971                 :                :         /* Form standard non-pivot tuple */
                                972                 :          21811 :         itup->t_info &= ~INDEX_ALT_TID_MASK;
                                973                 :          21811 :         htids = &itup->t_tid;
                                974                 :                :     }
                                975                 :                : 
                                976                 :          27813 :     ui = 0;
                                977                 :          27813 :     d = 0;
                                978         [ +  + ]:         151491 :     for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
                                979                 :                :     {
                                980   [ +  +  +  + ]:         123678 :         if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
                                981                 :                :         {
                                982                 :          61983 :             d++;
                                983                 :          61983 :             continue;
                                984                 :                :         }
                                985                 :          61695 :         htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
                                986                 :                :     }
                                987         [ -  + ]:          27813 :     Assert(ui == nhtids);
                                988         [ -  + ]:          27813 :     Assert(d == vacposting->ndeletedtids);
                                989   [ +  +  -  + ]:          27813 :     Assert(nhtids == 1 || _bt_posting_valid(itup));
 1504                           990   [ +  +  -  + ]:          27813 :     Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
                                991                 :                : 
                                992                 :                :     /* vacposting arg's itup will now point to updated version */
 1509                           993                 :          27813 :     vacposting->itup = itup;
                                994                 :          27813 : }
                                995                 :                : 
                                996                 :                : /*
                                997                 :                :  * Prepare for a posting list split by swapping heap TID in newitem with heap
                                998                 :                :  * TID from original posting list (the 'oposting' heap TID located at offset
                                999                 :                :  * 'postingoff').  Modifies newitem, so caller should pass their own private
                               1000                 :                :  * copy that can safely be modified.
                               1001                 :                :  *
                               1002                 :                :  * Returns new posting list tuple, which is palloc()'d in caller's context.
                               1003                 :                :  * This is guaranteed to be the same size as 'oposting'.  Modified newitem is
                               1004                 :                :  * what caller actually inserts. (This happens inside the same critical
                               1005                 :                :  * section that performs an in-place update of old posting list using new
                               1006                 :                :  * posting list returned here.)
                               1007                 :                :  *
                               1008                 :                :  * While the keys from newitem and oposting must be opclass equal, and must
                               1009                 :                :  * generate identical output when run through the underlying type's output
                               1010                 :                :  * function, it doesn't follow that their representations match exactly.
                               1011                 :                :  * Caller must avoid assuming that there can't be representational differences
                               1012                 :                :  * that make datums from oposting bigger or smaller than the corresponding
                               1013                 :                :  * datums from newitem.  For example, differences in TOAST input state might
                               1014                 :                :  * break a faulty assumption about tuple size (the executor is entitled to
                               1015                 :                :  * apply TOAST compression based on its own criteria).  It also seems possible
                               1016                 :                :  * that further representational variation will be introduced in the future,
                               1017                 :                :  * in order to support nbtree features like page-level prefix compression.
                               1018                 :                :  *
                               1019                 :                :  * See nbtree/README for details on the design of posting list splits.
                               1020                 :                :  */
                               1021                 :                : IndexTuple
                               1022                 :          15760 : _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
                               1023                 :                : {
                               1024                 :                :     int         nhtids;
                               1025                 :                :     char       *replacepos;
                               1026                 :                :     char       *replaceposright;
                               1027                 :                :     Size        nmovebytes;
                               1028                 :                :     IndexTuple  nposting;
                               1029                 :                : 
                               1030                 :          15760 :     nhtids = BTreeTupleGetNPosting(oposting);
                               1031         [ -  + ]:          15760 :     Assert(_bt_posting_valid(oposting));
                               1032                 :                : 
                               1033                 :                :     /*
                               1034                 :                :      * The postingoff argument originated as a _bt_binsrch_posting() return
                               1035                 :                :      * value.  It will be 0 in the event of corruption that makes a leaf page
                               1036                 :                :      * contain a non-pivot tuple that's somehow identical to newitem (no two
                               1037                 :                :      * non-pivot tuples should ever have the same TID).  This has been known
                               1038                 :                :      * to happen in the field from time to time.
                               1039                 :                :      *
                               1040                 :                :      * Perform a basic sanity check to catch this case now.
                               1041                 :                :      */
 1066                          1042   [ +  -  -  + ]:          15760 :     if (!(postingoff > 0 && postingoff < nhtids))
 1066 pg@bowt.ie               1043         [ #  # ]:UBC           0 :         elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
                               1044                 :                :              nhtids, postingoff);
                               1045                 :                : 
                               1046                 :                :     /*
                               1047                 :                :      * Move item pointers in posting list to make a gap for the new item's
                               1048                 :                :      * heap TID.  We shift TIDs one place to the right, losing original
                               1049                 :                :      * rightmost TID. (nmovebytes must not include TIDs to the left of
                               1050                 :                :      * postingoff, nor the existing rightmost/max TID that gets overwritten.)
                               1051                 :                :      */
 1509 pg@bowt.ie               1052                 :CBC       15760 :     nposting = CopyIndexTuple(oposting);
                               1053                 :          15760 :     replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
                               1054                 :          15760 :     replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
                               1055                 :          15760 :     nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
                               1056                 :          15760 :     memmove(replaceposright, replacepos, nmovebytes);
                               1057                 :                : 
                               1058                 :                :     /* Fill the gap at postingoff with TID of new item (original new TID) */
                               1059   [ +  -  -  + ]:          15760 :     Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
                               1060                 :          15760 :     ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
                               1061                 :                : 
                               1062                 :                :     /* Now copy oposting's rightmost/max TID into new item (final new TID) */
                               1063                 :          15760 :     ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
                               1064                 :                : 
                               1065         [ -  + ]:          15760 :     Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
                               1066                 :                :                               BTreeTupleGetHeapTID(newitem)) < 0);
                               1067         [ -  + ]:          15760 :     Assert(_bt_posting_valid(nposting));
                               1068                 :                : 
                               1069                 :          15760 :     return nposting;
                               1070                 :                : }
                               1071                 :                : 
                               1072                 :                : /*
                               1073                 :                :  * Verify posting list invariants for "posting", which must be a posting list
                               1074                 :                :  * tuple.  Used within assertions.
                               1075                 :                :  */
                               1076                 :                : #ifdef USE_ASSERT_CHECKING
                               1077                 :                : static bool
                               1078                 :         369543 : _bt_posting_valid(IndexTuple posting)
                               1079                 :                : {
                               1080                 :                :     ItemPointerData last;
                               1081                 :                :     ItemPointer htid;
                               1082                 :                : 
                               1083   [ +  -  -  + ]:         369543 :     if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
 1509 pg@bowt.ie               1084                 :UBC           0 :         return false;
                               1085                 :                : 
                               1086                 :                :     /* Remember first heap TID for loop */
 1509 pg@bowt.ie               1087                 :CBC      369543 :     ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
                               1088         [ -  + ]:         369543 :     if (!ItemPointerIsValid(&last))
 1509 pg@bowt.ie               1089                 :UBC           0 :         return false;
                               1090                 :                : 
                               1091                 :                :     /* Iterate, starting from second TID */
 1509 pg@bowt.ie               1092         [ +  + ]:CBC     7506025 :     for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
                               1093                 :                :     {
                               1094                 :        7136482 :         htid = BTreeTupleGetPostingN(posting, i);
                               1095                 :                : 
                               1096         [ -  + ]:        7136482 :         if (!ItemPointerIsValid(htid))
 1509 pg@bowt.ie               1097                 :UBC           0 :             return false;
 1509 pg@bowt.ie               1098         [ -  + ]:CBC     7136482 :         if (ItemPointerCompare(htid, &last) <= 0)
 1509 pg@bowt.ie               1099                 :UBC           0 :             return false;
 1509 pg@bowt.ie               1100                 :CBC     7136482 :         ItemPointerCopy(htid, &last);
                               1101                 :                :     }
                               1102                 :                : 
                               1103                 :         369543 :     return true;
                               1104                 :                : }
                               1105                 :                : #endif
        

Generated by: LCOV version 2.1-beta2-3-g6141622