LCOV - differential code coverage report
Current view: top level - src/backend/access/heap - pruneheap.c (source / functions) Coverage Total Hit LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 92.4 % 331 306 10 9 6 1 130 5 170 18 120 6
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 11 11 9 1 1 9
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * pruneheap.c
       4                 :  *    heap page pruning and HOT-chain management code
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/access/heap/pruneheap.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : #include "postgres.h"
      16                 : 
      17                 : #include "access/heapam.h"
      18                 : #include "access/heapam_xlog.h"
      19                 : #include "access/htup_details.h"
      20                 : #include "access/transam.h"
      21                 : #include "access/xlog.h"
      22                 : #include "access/xloginsert.h"
      23                 : #include "catalog/catalog.h"
      24                 : #include "miscadmin.h"
      25                 : #include "pgstat.h"
      26                 : #include "storage/bufmgr.h"
      27                 : #include "utils/snapmgr.h"
      28                 : #include "utils/rel.h"
      29                 : #include "utils/snapmgr.h"
      30                 : 
/* Working data for heap_page_prune and subroutines */
typedef struct
{
	Relation	rel;

	/* tuple visibility test, initialized for the relation */
	GlobalVisState *vistest;

	/*
	 * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have
	 * been computed (done on demand, and only if
	 * OldSnapshotThresholdActive()). The first time a tuple is about to be
	 * removed based on the limited horizon, old_snap_used is set to true, and
	 * SetOldSnapshotThresholdTimestamp() is called. See
	 * heap_prune_satisfies_vacuum().
	 */
	TimestampTz old_snap_ts;
	TransactionId old_snap_xmin;
	bool		old_snap_used;

	TransactionId new_prune_xid;	/* new prune hint value for page */
	TransactionId snapshotConflictHorizon;	/* latest xid removed */
	int			nredirected;	/* numbers of entries in arrays below */
	int			ndead;
	int			nunused;

	/*
	 * Arrays that accumulate line pointer numbers of items to be changed.
	 * redirected stores (from, to) offset pairs, hence twice the maximum
	 * item count; see how it is WAL-registered in heap_page_prune().
	 */
	OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
	OffsetNumber nowdead[MaxHeapTuplesPerPage];
	OffsetNumber nowunused[MaxHeapTuplesPerPage];

	/*
	 * marked[i] is true if item i is entered in one of the above arrays.
	 *
	 * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
	 * 1. Otherwise every access would need to subtract 1.
	 */
	bool		marked[MaxHeapTuplesPerPage + 1];

	/*
	 * Tuple visibility is only computed once for each tuple, for correctness
	 * and efficiency reasons; see comment in heap_page_prune() for details.
	 * This is of type int8[], instead of HTSV_Result[], so we can use -1 to
	 * indicate no visibility has been computed, e.g. for LP_DEAD items.
	 *
	 * Same indexing as ->marked.
	 */
	int8		htsv[MaxHeapTuplesPerPage + 1];
} PruneState;
      79                 : 
      80                 : /* Local functions */
      81                 : static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
      82                 :                                                HeapTuple tup,
      83                 :                                                Buffer buffer);
      84                 : static int  heap_prune_chain(Buffer buffer,
      85                 :                              OffsetNumber rootoffnum,
      86                 :                              PruneState *prstate);
      87                 : static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
      88                 : static void heap_prune_record_redirect(PruneState *prstate,
      89                 :                                        OffsetNumber offnum, OffsetNumber rdoffnum);
      90                 : static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
      91                 : static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
      92                 : static void page_verify_redirects(Page page);
      93                 : 
      94                 : 
      95                 : /*
      96                 :  * Optionally prune and repair fragmentation in the specified page.
      97                 :  *
      98                 :  * This is an opportunistic function.  It will perform housekeeping
      99                 :  * only if the page heuristically looks like a candidate for pruning and we
     100                 :  * can acquire buffer cleanup lock without blocking.
     101                 :  *
     102                 :  * Note: this is called quite often.  It's important that it fall out quickly
     103                 :  * if there's not any use in pruning.
     104                 :  *
     105                 :  * Caller must have pin on the buffer, and must *not* have a lock on it.
     106                 :  */
     107                 : void
     108 CBC    17752776 : heap_page_prune_opt(Relation relation, Buffer buffer)
     109                 : {
     110        17752776 :     Page        page = BufferGetPage(buffer);
     111                 :     TransactionId prune_xid;
     112                 :     GlobalVisState *vistest;
     113        17752776 :     TransactionId limited_xmin = InvalidTransactionId;
     114        17752776 :     TimestampTz limited_ts = 0;
     115                 :     Size        minfree;
     116                 : 
     117                 :     /*
     118                 :      * We can't write WAL in recovery mode, so there's no point trying to
     119                 :      * clean the page. The primary will likely issue a cleaning WAL record
     120                 :      * soon anyway, so this is no particular loss.
     121                 :      */
     122        17752776 :     if (RecoveryInProgress())
     123        16621937 :         return;
     124                 : 
     125                 :     /*
     126                 :      * XXX: Magic to keep old_snapshot_threshold tests appear "working". They
     127                 :      * currently are broken, and discussion of what to do about them is
     128                 :      * ongoing. See
     129                 :      * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de
     130                 :      */
     131        17575599 :     if (old_snapshot_threshold == 0)
     132            2627 :         SnapshotTooOldMagicForTest();
     133                 : 
     134                 :     /*
     135                 :      * First check whether there's any chance there's something to prune,
     136                 :      * determining the appropriate horizon is a waste if there's no prune_xid
     137                 :      * (i.e. no updates/deletes left potentially dead tuples around).
     138                 :      */
     139        17575599 :     prune_xid = ((PageHeader) page)->pd_prune_xid;
     140        17575599 :     if (!TransactionIdIsValid(prune_xid))
     141        10033748 :         return;
     142                 : 
     143                 :     /*
     144                 :      * Check whether prune_xid indicates that there may be dead rows that can
     145                 :      * be cleaned up.
     146                 :      *
     147                 :      * It is OK to check the old snapshot limit before acquiring the cleanup
     148                 :      * lock because the worst that can happen is that we are not quite as
     149                 :      * aggressive about the cleanup (by however many transaction IDs are
     150                 :      * consumed between this point and acquiring the lock).  This allows us to
     151                 :      * save significant overhead in the case where the page is found not to be
     152                 :      * prunable.
     153                 :      *
     154                 :      * Even if old_snapshot_threshold is set, we first check whether the page
     155                 :      * can be pruned without. Both because
     156                 :      * TransactionIdLimitedForOldSnapshots() is not cheap, and because not
     157                 :      * unnecessarily relying on old_snapshot_threshold avoids causing
     158                 :      * conflicts.
     159                 :      */
     160         7541851 :     vistest = GlobalVisTestFor(relation);
     161                 : 
     162         7541851 :     if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
     163                 :     {
     164         6410719 :         if (!OldSnapshotThresholdActive())
     165         6410698 :             return;
     166                 : 
     167              21 :         if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest),
     168                 :                                                  relation,
     169                 :                                                  &limited_xmin, &limited_ts))
     170              19 :             return;
     171                 : 
     172               2 :         if (!TransactionIdPrecedes(prune_xid, limited_xmin))
     173               2 :             return;
     174                 :     }
     175                 : 
     176                 :     /*
     177                 :      * We prune when a previous UPDATE failed to find enough space on the page
     178                 :      * for a new tuple version, or when free space falls below the relation's
     179                 :      * fill-factor target (but not less than 10%).
     180                 :      *
     181                 :      * Checking free space here is questionable since we aren't holding any
     182                 :      * lock on the buffer; in the worst case we could get a bogus answer. It's
     183                 :      * unlikely to be *seriously* wrong, though, since reading either pd_lower
     184                 :      * or pd_upper is probably atomic.  Avoiding taking a lock seems more
     185                 :      * important than sometimes getting a wrong answer in what is after all
     186                 :      * just a heuristic estimate.
     187                 :      */
     188         1131132 :     minfree = RelationGetTargetPageFreeSpace(relation,
     189                 :                                              HEAP_DEFAULT_FILLFACTOR);
     190         1131132 :     minfree = Max(minfree, BLCKSZ / 10);
     191                 : 
     192         1131132 :     if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
     193                 :     {
     194                 :         /* OK, try to get exclusive buffer lock */
     195          104122 :         if (!ConditionalLockBufferForCleanup(buffer))
     196             293 :             return;
     197                 : 
     198                 :         /*
     199                 :          * Now that we have buffer lock, get accurate information about the
     200                 :          * page's free space, and recheck the heuristic about whether to
     201                 :          * prune. (We needn't recheck PageIsPrunable, since no one else could
     202                 :          * have pruned while we hold pin.)
     203                 :          */
     204          103829 :         if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
     205                 :         {
     206                 :             int         ndeleted,
     207                 :                         nnewlpdead;
     208                 : 
     209          103829 :             ndeleted = heap_page_prune(relation, buffer, vistest, limited_xmin,
     210                 :                                        limited_ts, &nnewlpdead, NULL);
     211                 : 
     212                 :             /*
     213                 :              * Report the number of tuples reclaimed to pgstats.  This is
     214                 :              * ndeleted minus the number of newly-LP_DEAD-set items.
     215                 :              *
     216                 :              * We derive the number of dead tuples like this to avoid totally
     217                 :              * forgetting about items that were set to LP_DEAD, since they
     218                 :              * still need to be cleaned up by VACUUM.  We only want to count
     219                 :              * heap-only tuples that just became LP_UNUSED in our report,
     220                 :              * which don't.
     221                 :              *
     222                 :              * VACUUM doesn't have to compensate in the same way when it
     223                 :              * tracks ndeleted, since it will set the same LP_DEAD items to
     224                 :              * LP_UNUSED separately.
     225                 :              */
     226          103829 :             if (ndeleted > nnewlpdead)
     227           54279 :                 pgstat_update_heap_dead_tuples(relation,
     228                 :                                                ndeleted - nnewlpdead);
     229                 :         }
     230                 : 
     231                 :         /* And release buffer lock */
     232          103829 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
     233                 : 
     234                 :         /*
     235                 :          * We avoid reuse of any free space created on the page by unrelated
     236                 :          * UPDATEs/INSERTs by opting to not update the FSM at this point.  The
     237                 :          * free space should be reused by UPDATEs to *this* page.
     238                 :          */
     239                 :     }
     240                 : }
     241                 : 
     242                 : 
/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.  Note that we
 * don't update the FSM information for page on caller's behalf.  Caller might
 * also need to account for a reduction in the length of the line pointer
 * array following array truncation by us.
 *
 * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD
 * (see heap_prune_satisfies_vacuum and
 * HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to
 * either have been set by TransactionIdLimitedForOldSnapshots, or
 * InvalidTransactionId/0 respectively.
 *
 * Sets *nnewlpdead for caller, indicating the number of items that were
 * newly set LP_DEAD during prune operation.
 *
 * off_loc is the offset location required by the caller to use in error
 * callback.
 *
 * Returns the number of tuples deleted from the page during this call.
 */
int
heap_page_prune(Relation relation, Buffer buffer,
				GlobalVisState *vistest,
				TransactionId old_snap_xmin,
				TimestampTz old_snap_ts,
				int *nnewlpdead,
				OffsetNumber *off_loc)
{
	int			ndeleted = 0;
	Page		page = BufferGetPage(buffer);
	BlockNumber blockno = BufferGetBlockNumber(buffer);
	OffsetNumber offnum,
				maxoff;
	PruneState	prstate;
	HeapTupleData tup;

	/*
	 * Our strategy is to scan the page and make lists of items to change,
	 * then apply the changes within a critical section.  This keeps as much
	 * logic as possible out of the critical section, and also ensures that
	 * WAL replay will work the same as the normal case.
	 *
	 * First, initialize the new pd_prune_xid value to zero (indicating no
	 * prunable tuples).  If we find any tuples which may soon become
	 * prunable, we will save the lowest relevant XID in new_prune_xid. Also
	 * initialize the rest of our working state.
	 */
	prstate.new_prune_xid = InvalidTransactionId;
	prstate.rel = relation;
	prstate.vistest = vistest;
	prstate.old_snap_xmin = old_snap_xmin;
	prstate.old_snap_ts = old_snap_ts;
	prstate.old_snap_used = false;
	prstate.snapshotConflictHorizon = InvalidTransactionId;
	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
	memset(prstate.marked, 0, sizeof(prstate.marked));

	maxoff = PageGetMaxOffsetNumber(page);
	tup.t_tableOid = RelationGetRelid(prstate.rel);

	/*
	 * Determine HTSV for all tuples.
	 *
	 * This is required for correctness to deal with cases where running HTSV
	 * twice could result in different results (e.g. RECENTLY_DEAD can turn to
	 * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid()
	 * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the
	 * inserting transaction aborts, ...). That in turn could cause
	 * heap_prune_chain() to behave incorrectly if a tuple is reached twice,
	 * once directly via a heap_prune_chain() and once following a HOT chain.
	 *
	 * It's also good for performance. Most commonly tuples within a page are
	 * stored at decreasing offsets (while the items are stored at increasing
	 * offsets). When processing all tuples on a page this leads to reading
	 * memory at decreasing offsets within a page, with a variable stride.
	 * That's hard for CPU prefetchers to deal with. Processing the items in
	 * reverse order (and thus the tuples in increasing order) increases
	 * prefetching efficiency significantly / decreases the number of cache
	 * misses.
	 */
	for (offnum = maxoff;
		 offnum >= FirstOffsetNumber;
		 offnum = OffsetNumberPrev(offnum))
	{
		ItemId		itemid = PageGetItemId(page, offnum);
		HeapTupleHeader htup;

		/* Nothing to do if slot doesn't contain a tuple */
		if (!ItemIdIsNormal(itemid))
		{
			/* -1 records "no visibility computed"; see PruneState.htsv */
			prstate.htsv[offnum] = -1;
			continue;
		}

		htup = (HeapTupleHeader) PageGetItem(page, itemid);
		tup.t_data = htup;
		tup.t_len = ItemIdGetLength(itemid);
		ItemPointerSet(&(tup.t_self), blockno, offnum);

		/*
		 * Set the offset number so that we can display it along with any
		 * error that occurred while processing this tuple.
		 */
		if (off_loc)
			*off_loc = offnum;

		prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup,
														   buffer);
	}

	/* Scan the page */
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemid;

		/* Ignore items already processed as part of an earlier chain */
		if (prstate.marked[offnum])
			continue;

		/* see preceding loop */
		if (off_loc)
			*off_loc = offnum;

		/* Nothing to do if slot is empty or already dead */
		itemid = PageGetItemId(page, offnum);
		if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
			continue;

		/* Process this item or chain of items */
		ndeleted += heap_prune_chain(buffer, offnum, &prstate);
	}

	/* Clear the offset information once we have processed the given page. */
	if (off_loc)
		*off_loc = InvalidOffsetNumber;

	/* Any error while applying the changes is critical */
	START_CRIT_SECTION();

	/* Have we found any prunable items? */
	if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
	{
		/*
		 * Apply the planned item changes, then repair page fragmentation, and
		 * update the page's hint bit about whether it has free line pointers.
		 */
		heap_page_prune_execute(buffer,
								prstate.redirected, prstate.nredirected,
								prstate.nowdead, prstate.ndead,
								prstate.nowunused, prstate.nunused);

		/*
		 * Update the page's pd_prune_xid field to either zero, or the lowest
		 * XID of any soon-prunable tuple.
		 */
		((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

		/*
		 * Also clear the "page is full" flag, since there's no point in
		 * repeating the prune/defrag process until something else happens to
		 * the page.
		 */
		PageClearFull(page);

		MarkBufferDirty(buffer);

		/*
		 * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did
		 */
		if (RelationNeedsWAL(relation))
		{
			xl_heap_prune xlrec;
			XLogRecPtr	recptr;

			xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(relation);
			xlrec.snapshotConflictHorizon = prstate.snapshotConflictHorizon;
			xlrec.nredirected = prstate.nredirected;
			xlrec.ndead = prstate.ndead;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, SizeOfHeapPrune);

			XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

			/*
			 * The OffsetNumber arrays are not actually in the buffer, but we
			 * pretend that they are.  When XLogInsert stores the whole
			 * buffer, the offset arrays need not be stored too.
			 */
			if (prstate.nredirected > 0)
				XLogRegisterBufData(0, (char *) prstate.redirected,
									prstate.nredirected *
									sizeof(OffsetNumber) * 2);

			if (prstate.ndead > 0)
				XLogRegisterBufData(0, (char *) prstate.nowdead,
									prstate.ndead * sizeof(OffsetNumber));

			if (prstate.nunused > 0)
				XLogRegisterBufData(0, (char *) prstate.nowunused,
									prstate.nunused * sizeof(OffsetNumber));

			recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE);

			PageSetLSN(BufferGetPage(buffer), recptr);
		}
	}
	else
	{
		/*
		 * If we didn't prune anything, but have found a new value for the
		 * pd_prune_xid field, update it and mark the buffer dirty. This is
		 * treated as a non-WAL-logged hint.
		 *
		 * Also clear the "page is full" flag if it is set, since there's no
		 * point in repeating the prune/defrag process until something else
		 * happens to the page.
		 */
		if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
			PageIsFull(page))
		{
			((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
			PageClearFull(page);
			MarkBufferDirtyHint(buffer, true);
		}
	}

	END_CRIT_SECTION();

	/* Record number of newly-set-LP_DEAD items for caller */
	*nnewlpdead = prstate.ndead;

	return ndeleted;
}
     481                 : 
     482                 : 
     483                 : /*
     484                 :  * Perform visibility checks for heap pruning.
     485                 :  *
     486                 :  * This is more complicated than just using GlobalVisTestIsRemovableXid()
     487                 :  * because of old_snapshot_threshold. We only want to increase the threshold
     488                 :  * that triggers errors for old snapshots when we actually decide to remove a
     489                 :  * row based on the limited horizon.
     490                 :  *
     491                 :  * Due to its cost we also only want to call
     492                 :  * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have
     493                 :  * done so in heap_hot_prune_opt() if pd_prune_xid was old enough. But we
     494                 :  * still want to be able to remove rows that are too new to be removed
     495                 :  * according to prstate->vistest, but that can be removed based on
     496                 :  * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on
     497                 :  * demand in here, if appropriate.
     498                 :  */
     499                 : static HTSV_Result
     500 GIC    14737742 : heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
     501 ECB             : {
     502                 :     HTSV_Result res;
     503                 :     TransactionId dead_after;
     504                 : 
     505 GIC    14737742 :     res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);
     506 ECB             : 
     507 GIC    14737742 :     if (res != HEAPTUPLE_RECENTLY_DEAD)
     508 CBC    12938086 :         return res;
     509 ECB             : 
     510                 :     /*
     511                 :      * If we are already relying on the limited xmin, there is no need to
     512                 :      * delay doing so anymore.
     513                 :      */
     514 GIC     1799656 :     if (prstate->old_snap_used)
     515 ECB             :     {
     516 UIC           0 :         Assert(TransactionIdIsValid(prstate->old_snap_xmin));
     517 EUB             : 
     518 UIC           0 :         if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
     519 UBC           0 :             res = HEAPTUPLE_DEAD;
     520               0 :         return res;
     521 EUB             :     }
     522                 : 
     523                 :     /*
     524                 :      * First check if GlobalVisTestIsRemovableXid() is sufficient to find the
     525                 :      * row dead. If not, and old_snapshot_threshold is enabled, try to use the
     526                 :      * lowered horizon.
     527                 :      */
     528 GIC     1799656 :     if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
     529 CBC     1459724 :         res = HEAPTUPLE_DEAD;
     530          339932 :     else if (OldSnapshotThresholdActive())
     531 ECB             :     {
     532                 :         /* haven't determined limited horizon yet, requests */
     533 UIC           0 :         if (!TransactionIdIsValid(prstate->old_snap_xmin))
     534 EUB             :         {
     535                 :             TransactionId horizon =
     536 UIC           0 :             GlobalVisTestNonRemovableHorizon(prstate->vistest);
     537 EUB             : 
     538 UIC           0 :             TransactionIdLimitedForOldSnapshots(horizon, prstate->rel,
     539 EUB             :                                                 &prstate->old_snap_xmin,
     540                 :                                                 &prstate->old_snap_ts);
     541                 :         }
     542                 : 
     543 UIC           0 :         if (TransactionIdIsValid(prstate->old_snap_xmin) &&
     544 UBC           0 :             TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
     545 EUB             :         {
     546                 :             /*
     547                 :              * About to remove row based on snapshot_too_old. Need to raise
     548                 :              * the threshold so problematic accesses would error.
     549                 :              */
     550 UIC           0 :             Assert(!prstate->old_snap_used);
     551 UBC           0 :             SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts,
     552 EUB             :                                              prstate->old_snap_xmin);
     553 UIC           0 :             prstate->old_snap_used = true;
     554 UBC           0 :             res = HEAPTUPLE_DEAD;
     555 EUB             :         }
     556                 :     }
     557                 : 
     558 GIC     1799656 :     return res;
     559 ECB             : }
     560                 : 
     561                 : 
     562                 : /*
     563                 :  * Prune specified line pointer or a HOT chain originating at line pointer.
     564                 :  *
     565                 :  * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
     566                 :  * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
     567                 :  * chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
     568                 :  * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
     569                 :  * DEAD, our visibility test is just too coarse to detect it.
     570                 :  *
     571                 :  * In general, pruning must never leave behind a DEAD tuple that still has
     572                 :  * tuple storage.  VACUUM isn't prepared to deal with that case.  That's why
     573                 :  * VACUUM prunes the same heap page a second time (without dropping its lock
     574                 :  * in the interim) when it sees a newly DEAD tuple that we initially saw as
     575                 :  * in-progress.  Retrying pruning like this can only happen when an inserting
     576                 :  * transaction concurrently aborts.
     577                 :  *
     578                 :  * The root line pointer is redirected to the tuple immediately after the
     579                 :  * latest DEAD tuple.  If all tuples in the chain are DEAD, the root line
     580                 :  * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
     581                 :  * tuple, which we treat as a chain of length 1.)
     582                 :  *
     583                 :  * We don't actually change the page here. We just add entries to the arrays in
     584                 :  * prstate showing the changes to be made.  Items to be redirected are added
     585                 :  * to the redirected[] array (two entries per redirection); items to be set to
     586                 :  * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
     587                 :  * state are added to nowunused[].
     588                 :  *
     589                 :  * Returns the number of tuples (to be) deleted from the page.
     590                 :  */
     591                 : static int
     592 GIC    15128903 : heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
     593 ECB             : {
     594 GIC    15128903 :     int         ndeleted = 0;
     595 CBC    15128903 :     Page        dp = (Page) BufferGetPage(buffer);
     596        15128903 :     TransactionId priorXmax = InvalidTransactionId;
     597 ECB             :     ItemId      rootlp;
     598                 :     HeapTupleHeader htup;
     599 GIC    15128903 :     OffsetNumber latestdead = InvalidOffsetNumber,
     600 CBC    15128903 :                 maxoff = PageGetMaxOffsetNumber(dp),
     601 ECB             :                 offnum;
     602                 :     OffsetNumber chainitems[MaxHeapTuplesPerPage];
     603 GIC    15128903 :     int         nchain = 0,
     604 ECB             :                 i;
     605                 : 
     606 GIC    15128903 :     rootlp = PageGetItemId(dp, rootoffnum);
     607 ECB             : 
     608                 :     /*
     609                 :      * If it's a heap-only tuple, then it is not the start of a HOT chain.
     610                 :      */
     611 GIC    15128903 :     if (ItemIdIsNormal(rootlp))
     612 ECB             :     {
     613 GIC    14543931 :         Assert(prstate->htsv[rootoffnum] != -1);
     614 CBC    14543931 :         htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
     615 ECB             : 
     616 GIC    14543931 :         if (HeapTupleHeaderIsHeapOnly(htup))
     617 ECB             :         {
     618                 :             /*
     619                 :              * If the tuple is DEAD and doesn't chain to anything else, mark
     620                 :              * it unused immediately.  (If it does chain, we can only remove
     621                 :              * it as part of pruning its chain.)
     622                 :              *
     623                 :              * We need this primarily to handle aborted HOT updates, that is,
     624                 :              * XMIN_INVALID heap-only tuples.  Those might not be linked to by
     625                 :              * any chain, since the parent tuple might be re-updated before
     626                 :              * any pruning occurs.  So we have to be able to reap them
     627                 :              * separately from chain-pruning.  (Note that
     628                 :              * HeapTupleHeaderIsHotUpdated will never return true for an
     629                 :              * XMIN_INVALID tuple, so this code will work even when there were
     630                 :              * sequential updates within the aborted transaction.)
     631                 :              *
     632                 :              * Note that we might first arrive at a dead heap-only tuple
     633                 :              * either here or while following a chain below.  Whichever path
     634                 :              * gets there first will mark the tuple unused.
     635                 :              */
     636 GIC      579696 :             if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD &&
     637 CBC        7238 :                 !HeapTupleHeaderIsHotUpdated(htup))
     638 ECB             :             {
     639 GIC        3132 :                 heap_prune_record_unused(prstate, rootoffnum);
     640 GNC        3132 :                 HeapTupleHeaderAdvanceConflictHorizon(htup,
     641                 :                                                       &prstate->snapshotConflictHorizon);
     642 GIC        3132 :                 ndeleted++;
     643 ECB             :             }
     644                 : 
     645                 :             /* Nothing more to do */
     646 GIC      579696 :             return ndeleted;
     647 ECB             :         }
     648                 :     }
     649                 : 
     650                 :     /* Start from the root tuple */
     651 GIC    14549207 :     offnum = rootoffnum;
     652 ECB             : 
     653                 :     /* while not end of the chain */
     654                 :     for (;;)
     655 GIC      761000 :     {
     656 ECB             :         ItemId      lp;
     657                 :         bool        tupdead,
     658                 :                     recent_dead;
     659                 : 
     660                 :         /* Sanity check (pure paranoia) */
     661 GIC    15310207 :         if (offnum < FirstOffsetNumber)
     662 LBC           0 :             break;
     663 EUB             : 
     664                 :         /*
     665                 :          * An offset past the end of page's line pointer array is possible
     666                 :          * when the array was truncated (original item must have been unused)
     667                 :          */
     668 GIC    15310207 :         if (offnum > maxoff)
     669 LBC           0 :             break;
     670 EUB             : 
     671                 :         /* If item is already processed, stop --- it must not be same chain */
     672 GIC    15310207 :         if (prstate->marked[offnum])
     673 CBC        1907 :             break;
     674 ECB             : 
     675 GIC    15308300 :         lp = PageGetItemId(dp, offnum);
     676 ECB             : 
     677                 :         /* Unused item obviously isn't part of the chain */
     678 GIC    15308300 :         if (!ItemIdIsUsed(lp))
     679 LBC           0 :             break;
     680 EUB             : 
     681                 :         /*
     682                 :          * If we are looking at the redirected root line pointer, jump to the
     683                 :          * first normal tuple in the chain.  If we find a redirect somewhere
     684                 :          * else, stop --- it must not be same chain.
     685                 :          */
     686 GIC    15308300 :         if (ItemIdIsRedirected(lp))
     687 ECB             :         {
     688 GIC      584972 :             if (nchain > 0)
     689 LBC           0 :                 break;          /* not at start of chain */
     690 GBC      584972 :             chainitems[nchain++] = offnum;
     691 CBC      584972 :             offnum = ItemIdGetRedirect(rootlp);
     692          584972 :             continue;
     693 ECB             :         }
     694                 : 
     695                 :         /*
     696                 :          * Likewise, a dead line pointer can't be part of the chain. (We
     697                 :          * already eliminated the case of dead root tuple outside this
     698                 :          * function.)
     699                 :          */
     700 GIC    14723328 :         if (ItemIdIsDead(lp))
     701 LBC           0 :             break;
     702 EUB             : 
     703 GIC    14723328 :         Assert(ItemIdIsNormal(lp));
     704 CBC    14723328 :         Assert(prstate->htsv[offnum] != -1);
     705        14723328 :         htup = (HeapTupleHeader) PageGetItem(dp, lp);
     706 ECB             : 
     707                 :         /*
     708                 :          * Check the tuple XMIN against prior XMAX, if any
     709                 :          */
     710 GIC    14898367 :         if (TransactionIdIsValid(priorXmax) &&
     711 CBC      175039 :             !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
     712 LBC           0 :             break;
     713 EUB             : 
     714                 :         /*
     715                 :          * OK, this tuple is indeed a member of the chain.
     716                 :          */
     717 GIC    14723328 :         chainitems[nchain++] = offnum;
     718 ECB             : 
     719                 :         /*
     720                 :          * Check tuple's visibility status.
     721                 :          */
     722 GIC    14723328 :         tupdead = recent_dead = false;
     723 ECB             : 
     724 GIC    14723328 :         switch ((HTSV_Result) prstate->htsv[offnum])
     725 ECB             :         {
     726 GIC     1476657 :             case HEAPTUPLE_DEAD:
     727 CBC     1476657 :                 tupdead = true;
     728         1476657 :                 break;
     729 ECB             : 
     730 GIC      339932 :             case HEAPTUPLE_RECENTLY_DEAD:
     731 CBC      339932 :                 recent_dead = true;
     732 ECB             : 
     733                 :                 /*
     734                 :                  * This tuple may soon become DEAD.  Update the hint field so
     735                 :                  * that the page is reconsidered for pruning in future.
     736                 :                  */
     737 GIC      339932 :                 heap_prune_record_prunable(prstate,
     738 CBC      339932 :                                            HeapTupleHeaderGetUpdateXid(htup));
     739          339932 :                 break;
     740 ECB             : 
     741 GIC       14271 :             case HEAPTUPLE_DELETE_IN_PROGRESS:
     742 ECB             : 
     743                 :                 /*
     744                 :                  * This tuple may soon become DEAD.  Update the hint field so
     745                 :                  * that the page is reconsidered for pruning in future.
     746                 :                  */
     747 GIC       14271 :                 heap_prune_record_prunable(prstate,
     748 CBC       14271 :                                            HeapTupleHeaderGetUpdateXid(htup));
     749           14271 :                 break;
     750 ECB             : 
     751 GIC    12892468 :             case HEAPTUPLE_LIVE:
     752 ECB             :             case HEAPTUPLE_INSERT_IN_PROGRESS:
     753                 : 
     754                 :                 /*
     755                 :                  * If we wanted to optimize for aborts, we might consider
     756                 :                  * marking the page prunable when we see INSERT_IN_PROGRESS.
     757                 :                  * But we don't.  See related decisions about when to mark the
     758                 :                  * page prunable in heapam.c.
     759                 :                  */
     760 GIC    12892468 :                 break;
     761 ECB             : 
     762 UIC           0 :             default:
     763 UBC           0 :                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
     764 EUB             :                 break;
     765                 :         }
     766                 : 
     767                 :         /*
     768                 :          * Remember the last DEAD tuple seen.  We will advance past
     769                 :          * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
     770                 :          * but we can't advance past anything else.  We have to make sure that
     771                 :          * we don't miss any DEAD tuples, since DEAD tuples that still have
     772                 :          * tuple storage after pruning will confuse VACUUM.
     773                 :          */
     774 GIC    14723328 :         if (tupdead)
     775 ECB             :         {
     776 GIC     1476657 :             latestdead = offnum;
     777 GNC     1476657 :             HeapTupleHeaderAdvanceConflictHorizon(htup,
     778                 :                                                   &prstate->snapshotConflictHorizon);
     779                 :         }
     780 GIC    13246671 :         else if (!recent_dead)
     781 CBC    12906739 :             break;
     782 ECB             : 
     783                 :         /*
     784                 :          * If the tuple is not HOT-updated, then we are at the end of this
     785                 :          * HOT-update chain.
     786                 :          */
     787 GIC     1816589 :         if (!HeapTupleHeaderIsHotUpdated(htup))
     788 ECB             :             break;
     789                 : 
     790                 :         /* HOT implies it can't have moved to different partition */
     791 GIC      176028 :         Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
     792 ECB             : 
     793                 :         /*
     794                 :          * Advance to next chain member.
     795                 :          */
     796 GIC      176028 :         Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
     797 ECB             :                BufferGetBlockNumber(buffer));
     798 GIC      176028 :         offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
     799 CBC      176028 :         priorXmax = HeapTupleHeaderGetUpdateXid(htup);
     800 ECB             :     }
     801                 : 
     802                 :     /*
     803                 :      * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
     804                 :      * the DEAD tuples at the start of the chain are removed and the root line
     805                 :      * pointer is appropriately redirected.
     806                 :      */
     807 GIC    14549207 :     if (OffsetNumberIsValid(latestdead))
     808 ECB             :     {
     809                 :         /*
     810                 :          * Mark as unused each intermediate item that we are able to remove
     811                 :          * from the chain.
     812                 :          *
     813                 :          * When the previous item is the last dead tuple seen, we are at the
     814                 :          * right candidate for redirection.
     815                 :          */
     816 GIC     1509961 :         for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
     817 ECB             :         {
     818 GIC       74717 :             heap_prune_record_unused(prstate, chainitems[i]);
     819 CBC       74717 :             ndeleted++;
     820 ECB             :         }
     821                 : 
     822                 :         /*
     823                 :          * If the root entry had been a normal tuple, we are deleting it, so
     824                 :          * count it in the result.  But changing a redirect (even to DEAD
     825                 :          * state) doesn't count.
     826                 :          */
     827 GIC     1435244 :         if (ItemIdIsNormal(rootlp))
     828 CBC     1401940 :             ndeleted++;
     829 ECB             : 
     830                 :         /*
     831                 :          * If the DEAD tuple is at the end of the chain, the entire chain is
     832                 :          * dead and the root line pointer can be marked dead.  Otherwise just
     833                 :          * redirect the root to the correct chain member.
     834                 :          */
     835 GIC     1435244 :         if (i >= nchain)
     836 CBC     1305612 :             heap_prune_record_dead(prstate, rootoffnum);
     837 ECB             :         else
     838 GIC      129632 :             heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
     839 ECB             :     }
     840 GIC    13113963 :     else if (nchain < 2 && ItemIdIsRedirected(rootlp))
     841 ECB             :     {
     842                 :         /*
     843                 :          * We found a redirect item that doesn't point to a valid follow-on
     844                 :          * item.  This can happen if the loop in heap_page_prune caused us to
     845                 :          * visit the dead successor of a redirect item before visiting the
     846                 :          * redirect item.  We can clean up by setting the redirect item to
     847                 :          * DEAD state.
     848                 :          */
     849 GIC         918 :         heap_prune_record_dead(prstate, rootoffnum);
     850 ECB             :     }
     851                 : 
     852 GIC    14549207 :     return ndeleted;
     853 ECB             : }
     854                 : 
     855                 : /* Record lowest soon-prunable XID */
     856                 : static void
     857 GIC      354203 : heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
     858 ECB             : {
     859                 :     /*
     860                 :      * This should exactly match the PageSetPrunable macro.  We can't store
     861                 :      * directly into the page header yet, so we update working state.
     862                 :      */
     863 GIC      354203 :     Assert(TransactionIdIsNormal(xid));
     864 CBC      698363 :     if (!TransactionIdIsValid(prstate->new_prune_xid) ||
     865          344160 :         TransactionIdPrecedes(xid, prstate->new_prune_xid))
     866           11185 :         prstate->new_prune_xid = xid;
     867          354203 : }
     868 ECB             : 
     869                 : /* Record line pointer to be redirected */
     870                 : static void
     871 GIC      129632 : heap_prune_record_redirect(PruneState *prstate,
     872 ECB             :                            OffsetNumber offnum, OffsetNumber rdoffnum)
     873                 : {
     874 GIC      129632 :     Assert(prstate->nredirected < MaxHeapTuplesPerPage);
     875 CBC      129632 :     prstate->redirected[prstate->nredirected * 2] = offnum;
     876          129632 :     prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
     877          129632 :     prstate->nredirected++;
     878          129632 :     Assert(!prstate->marked[offnum]);
     879          129632 :     prstate->marked[offnum] = true;
     880          129632 :     Assert(!prstate->marked[rdoffnum]);
     881          129632 :     prstate->marked[rdoffnum] = true;
     882          129632 : }
     883 ECB             : 
     884                 : /* Record line pointer to be marked dead */
     885                 : static void
     886 GIC     1306530 : heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
     887 ECB             : {
     888 GIC     1306530 :     Assert(prstate->ndead < MaxHeapTuplesPerPage);
     889 CBC     1306530 :     prstate->nowdead[prstate->ndead] = offnum;
     890         1306530 :     prstate->ndead++;
     891         1306530 :     Assert(!prstate->marked[offnum]);
     892         1306530 :     prstate->marked[offnum] = true;
     893         1306530 : }
     894 ECB             : 
     895                 : /* Record line pointer to be marked unused */
     896                 : static void
     897 GIC       77849 : heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
     898 ECB             : {
     899 GIC       77849 :     Assert(prstate->nunused < MaxHeapTuplesPerPage);
     900 CBC       77849 :     prstate->nowunused[prstate->nunused] = offnum;
     901           77849 :     prstate->nunused++;
     902           77849 :     Assert(!prstate->marked[offnum]);
     903           77849 :     prstate->marked[offnum] = true;
     904           77849 : }
     905 ECB             : 
     906                 : 
     907                 : /*
     908                 :  * Perform the actual page changes needed by heap_page_prune.
     909                 :  * It is expected that the caller has a full cleanup lock on the
     910                 :  * buffer.
     911                 :  */
     912                 : void
     913 GIC      118566 : heap_page_prune_execute(Buffer buffer,
     914 ECB             :                         OffsetNumber *redirected, int nredirected,
     915                 :                         OffsetNumber *nowdead, int ndead,
     916                 :                         OffsetNumber *nowunused, int nunused)
     917                 : {
     918 GIC      118566 :     Page        page = (Page) BufferGetPage(buffer);
     919 ECB             :     OffsetNumber *offnum;
     920                 :     HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;
     921                 : 
     922                 :     /* Shouldn't be called unless there's something to do */
     923 GIC      118566 :     Assert(nredirected > 0 || ndead > 0 || nunused > 0);
     924 ECB             : 
     925                 :     /* Update all redirected line pointers */
     926 GIC      118566 :     offnum = redirected;
     927 CBC      261341 :     for (int i = 0; i < nredirected; i++)
     928 ECB             :     {
     929 GIC      142775 :         OffsetNumber fromoff = *offnum++;
     930 CBC      142775 :         OffsetNumber tooff = *offnum++;
     931          142775 :         ItemId      fromlp = PageGetItemId(page, fromoff);
     932 ECB             :         ItemId      tolp PG_USED_FOR_ASSERTS_ONLY;
     933                 : 
     934                 : #ifdef USE_ASSERT_CHECKING
     935                 : 
     936                 :         /*
     937                 :          * Any existing item that we set as an LP_REDIRECT (any 'from' item)
     938                 :          * must be the first item from a HOT chain.  If the item has tuple
     939                 :          * storage then it can't be a heap-only tuple.  Otherwise we are just
     940                 :          * maintaining an existing LP_REDIRECT from an existing HOT chain that
     941                 :          * has been pruned at least once before now.
     942                 :          */
     943 GIC      142775 :         if (!ItemIdIsRedirected(fromlp))
     944 ECB             :         {
     945 GIC      132479 :             Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));
     946 ECB             : 
     947 GIC      132479 :             htup = (HeapTupleHeader) PageGetItem(page, fromlp);
     948 CBC      132479 :             Assert(!HeapTupleHeaderIsHeapOnly(htup));
     949 ECB             :         }
     950                 :         else
     951                 :         {
     952                 :             /* We shouldn't need to redundantly set the redirect */
     953 GIC       10296 :             Assert(ItemIdGetRedirect(fromlp) != tooff);
     954 ECB             :         }
     955                 : 
     956                 :         /*
     957                 :          * The item that we're about to set as an LP_REDIRECT (the 'from'
     958                 :          * item) will point to an existing item (the 'to' item) that is
     959                 :          * already a heap-only tuple.  There can be at most one LP_REDIRECT
     960                 :          * item per HOT chain.
     961                 :          *
     962                 :          * We need to keep around an LP_REDIRECT item (after original
     963                 :          * non-heap-only root tuple gets pruned away) so that it's always
     964                 :          * possible for VACUUM to easily figure out what TID to delete from
     965                 :          * indexes when an entire HOT chain becomes dead.  A heap-only tuple
     966                 :          * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
     967                 :          * tuple can.
     968                 :          *
     969                 :          * This check may miss problems, e.g. the target of a redirect could
     970                 :          * be marked as unused subsequently. The page_verify_redirects() check
     971                 :          * below will catch such problems.
     972                 :          */
     973 GIC      142775 :         tolp = PageGetItemId(page, tooff);
     974 CBC      142775 :         Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
     975          142775 :         htup = (HeapTupleHeader) PageGetItem(page, tolp);
     976          142775 :         Assert(HeapTupleHeaderIsHeapOnly(htup));
     977 ECB             : #endif
     978                 : 
     979 GIC      142775 :         ItemIdSetRedirect(fromlp, tooff);
     980 ECB             :     }
     981                 : 
     982                 :     /* Update all now-dead line pointers */
     983 GIC      118566 :     offnum = nowdead;
     984 CBC     1685949 :     for (int i = 0; i < ndead; i++)
     985 ECB             :     {
     986 GIC     1567383 :         OffsetNumber off = *offnum++;
     987 CBC     1567383 :         ItemId      lp = PageGetItemId(page, off);
     988 ECB             : 
     989                 : #ifdef USE_ASSERT_CHECKING
     990                 : 
     991                 :         /*
     992                 :          * An LP_DEAD line pointer must be left behind when the original item
     993                 :          * (which is dead to everybody) could still be referenced by a TID in
     994                 :          * an index.  This should never be necessary with any individual
     995                 :          * heap-only tuple item, though. (It's not clear how much of a problem
     996                 :          * that would be, but there is no reason to allow it.)
     997                 :          */
     998 GIC     1567383 :         if (ItemIdHasStorage(lp))
     999 ECB             :         {
    1000 GIC     1542159 :             Assert(ItemIdIsNormal(lp));
    1001 CBC     1542159 :             htup = (HeapTupleHeader) PageGetItem(page, lp);
    1002         1542159 :             Assert(!HeapTupleHeaderIsHeapOnly(htup));
    1003 ECB             :         }
    1004                 :         else
    1005                 :         {
    1006                 :             /* Whole HOT chain becomes dead */
    1007 GIC       25224 :             Assert(ItemIdIsRedirected(lp));
    1008 ECB             :         }
    1009                 : #endif
    1010                 : 
    1011 GIC     1567383 :         ItemIdSetDead(lp);
    1012 ECB             :     }
    1013                 : 
    1014                 :     /* Update all now-unused line pointers */
    1015 GIC      118566 :     offnum = nowunused;
    1016 CBC      203209 :     for (int i = 0; i < nunused; i++)
    1017 ECB             :     {
    1018 GIC       84643 :         OffsetNumber off = *offnum++;
    1019 CBC       84643 :         ItemId      lp = PageGetItemId(page, off);
    1020 ECB             : 
    1021                 : #ifdef USE_ASSERT_CHECKING
    1022                 : 
    1023                 :         /*
    1024                 :          * Only heap-only tuples can become LP_UNUSED during pruning.  They
    1025                 :          * don't need to be left in place as LP_DEAD items until VACUUM gets
    1026                 :          * around to doing index vacuuming.
    1027                 :          */
    1028 GIC       84643 :         Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
    1029 CBC       84643 :         htup = (HeapTupleHeader) PageGetItem(page, lp);
    1030           84643 :         Assert(HeapTupleHeaderIsHeapOnly(htup));
    1031 ECB             : #endif
    1032                 : 
    1033 GIC       84643 :         ItemIdSetUnused(lp);
    1034 ECB             :     }
    1035                 : 
    1036                 :     /*
    1037                 :      * Finally, repair any fragmentation, and update the page's hint bit about
    1038                 :      * whether it has free pointers.
    1039                 :      */
    1040 GIC      118566 :     PageRepairFragmentation(page);
    1041 ECB             : 
    1042                 :     /*
    1043                 :      * Now that the page has been modified, assert that redirect items still
    1044                 :      * point to valid targets.
    1045                 :      */
    1046 GIC      118566 :     page_verify_redirects(page);
    1047 CBC      118566 : }
    1048 ECB             : 
    1049                 : 
    1050                 : /*
    1051                 :  * If built with assertions, verify that all LP_REDIRECT items point to a
    1052                 :  * valid item.
    1053                 :  *
    1054                 :  * One way that bugs related to HOT pruning show is redirect items pointing to
    1055                 :  * removed tuples. It's not trivial to reliably check that marking an item
    1056                 :  * unused will not orphan a redirect item during heap_prune_chain() /
    1057                 :  * heap_page_prune_execute(), so we additionally check the whole page after
    1058                 :  * pruning. Without this check such bugs would typically only cause asserts
    1059                 :  * later, potentially well after the corruption has been introduced.
    1060                 :  *
    1061                 :  * Also check comments in heap_page_prune_execute()'s redirection loop.
    1062                 :  */
    1063                 : static void
    1064 GIC      118566 : page_verify_redirects(Page page)
    1065 ECB             : {
    1066                 : #ifdef USE_ASSERT_CHECKING
    1067                 :     OffsetNumber offnum;
    1068                 :     OffsetNumber maxoff;
    1069                 : 
    1070 GIC      118566 :     maxoff = PageGetMaxOffsetNumber(page);
    1071 CBC      118566 :     for (offnum = FirstOffsetNumber;
    1072         7809329 :          offnum <= maxoff;
    1073         7690763 :          offnum = OffsetNumberNext(offnum))
    1074 ECB             :     {
    1075 GIC     7690763 :         ItemId      itemid = PageGetItemId(page, offnum);
    1076 ECB             :         OffsetNumber targoff;
    1077                 :         ItemId      targitem;
    1078                 :         HeapTupleHeader htup;
    1079                 : 
    1080 GIC     7690763 :         if (!ItemIdIsRedirected(itemid))
    1081 CBC     7044702 :             continue;
    1082 ECB             : 
    1083 GIC      646061 :         targoff = ItemIdGetRedirect(itemid);
    1084 CBC      646061 :         targitem = PageGetItemId(page, targoff);
    1085 ECB             : 
    1086 GIC      646061 :         Assert(ItemIdIsUsed(targitem));
    1087 CBC      646061 :         Assert(ItemIdIsNormal(targitem));
    1088          646061 :         Assert(ItemIdHasStorage(targitem));
    1089          646061 :         htup = (HeapTupleHeader) PageGetItem(page, targitem);
    1090          646061 :         Assert(HeapTupleHeaderIsHeapOnly(htup));
    1091 ECB             :     }
    1092                 : #endif
    1093 GIC      118566 : }
    1094 ECB             : 
    1095                 : 
    1096                 : /*
    1097                 :  * For all items in this page, find their respective root line pointers.
    1098                 :  * If item k is part of a HOT-chain with root at item j, then we set
    1099                 :  * root_offsets[k - 1] = j.
    1100                 :  *
    1101                 :  * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
    1102                 :  * Unused entries are filled with InvalidOffsetNumber (zero).
    1103                 :  *
    1104                 :  * The function must be called with at least share lock on the buffer, to
    1105                 :  * prevent concurrent prune operations.
    1106                 :  *
    1107                 :  * Note: The information collected here is valid only as long as the caller
    1108                 :  * holds a pin on the buffer. Once pin is released, a tuple might be pruned
    1109                 :  * and reused by a completely unrelated tuple.
    1110                 :  */
    1111                 : void
    1112 GIC      204041 : heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
    1113 ECB             : {
    1114                 :     OffsetNumber offnum,
    1115                 :                 maxoff;
    1116                 : 
    1117 GIC      204041 :     MemSet(root_offsets, InvalidOffsetNumber,
    1118 ECB             :            MaxHeapTuplesPerPage * sizeof(OffsetNumber));
    1119                 : 
    1120 GIC      204041 :     maxoff = PageGetMaxOffsetNumber(page);
    1121 CBC    14254285 :     for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    1122 ECB             :     {
    1123 GIC    14050244 :         ItemId      lp = PageGetItemId(page, offnum);
    1124 ECB             :         HeapTupleHeader htup;
    1125                 :         OffsetNumber nextoffnum;
    1126                 :         TransactionId priorXmax;
    1127                 : 
    1128                 :         /* skip unused and dead items */
    1129 GIC    14050244 :         if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
    1130 CBC        2932 :             continue;
    1131 ECB             : 
    1132 GIC    14047312 :         if (ItemIdIsNormal(lp))
    1133 ECB             :         {
    1134 GIC    14045874 :             htup = (HeapTupleHeader) PageGetItem(page, lp);
    1135 ECB             : 
    1136                 :             /*
    1137                 :              * Check if this tuple is part of a HOT-chain rooted at some other
    1138                 :              * tuple. If so, skip it for now; we'll process it when we find
    1139                 :              * its root.
    1140                 :              */
    1141 GIC    14045874 :             if (HeapTupleHeaderIsHeapOnly(htup))
    1142 CBC        1664 :                 continue;
    1143 ECB             : 
    1144                 :             /*
    1145                 :              * This is either a plain tuple or the root of a HOT-chain.
    1146                 :              * Remember it in the mapping.
    1147                 :              */
    1148 GIC    14044210 :             root_offsets[offnum - 1] = offnum;
    1149 ECB             : 
    1150                 :             /* If it's not the start of a HOT-chain, we're done with it */
    1151 GIC    14044210 :             if (!HeapTupleHeaderIsHotUpdated(htup))
    1152 CBC    14044030 :                 continue;
    1153 ECB             : 
    1154                 :             /* Set up to scan the HOT-chain */
    1155 GIC         180 :             nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
    1156 CBC         180 :             priorXmax = HeapTupleHeaderGetUpdateXid(htup);
    1157 ECB             :         }
    1158                 :         else
    1159                 :         {
    1160                 :             /* Must be a redirect item. We do not set its root_offsets entry */
    1161 GIC        1438 :             Assert(ItemIdIsRedirected(lp));
    1162 ECB             :             /* Set up to scan the HOT-chain */
    1163 GIC        1438 :             nextoffnum = ItemIdGetRedirect(lp);
    1164 CBC        1438 :             priorXmax = InvalidTransactionId;
    1165 ECB             :         }
    1166                 : 
    1167                 :         /*
    1168                 :          * Now follow the HOT-chain and collect other tuples in the chain.
    1169                 :          *
    1170                 :          * Note: Even though this is a nested loop, the complexity of the
    1171                 :          * function is O(N) because a tuple in the page should be visited not
    1172                 :          * more than twice, once in the outer loop and once in HOT-chain
    1173                 :          * chases.
    1174                 :          */
    1175                 :         for (;;)
    1176                 :         {
    1177                 :             /* Sanity check (pure paranoia) */
    1178 GIC        1664 :             if (offnum < FirstOffsetNumber)
    1179 LBC           0 :                 break;
    1180 EUB             : 
    1181                 :             /*
    1182                 :              * An offset past the end of page's line pointer array is possible
    1183                 :              * when the array was truncated
    1184                 :              */
    1185 GIC        1664 :             if (offnum > maxoff)
    1186 LBC           0 :                 break;
    1187 EUB             : 
    1188 GIC        1664 :             lp = PageGetItemId(page, nextoffnum);
    1189 ECB             : 
    1190                 :             /* Check for broken chains */
    1191 GIC        1664 :             if (!ItemIdIsNormal(lp))
    1192 LBC           0 :                 break;
    1193 EUB             : 
    1194 GIC        1664 :             htup = (HeapTupleHeader) PageGetItem(page, lp);
    1195 ECB             : 
    1196 GIC        1890 :             if (TransactionIdIsValid(priorXmax) &&
    1197 CBC         226 :                 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
    1198 LBC           0 :                 break;
    1199 EUB             : 
    1200                 :             /* Remember the root line pointer for this item */
    1201 GIC        1664 :             root_offsets[nextoffnum - 1] = offnum;
    1202 ECB             : 
    1203                 :             /* Advance to next chain member, if any */
    1204 GIC        1664 :             if (!HeapTupleHeaderIsHotUpdated(htup))
    1205 ECB             :                 break;
    1206                 : 
    1207                 :             /* HOT implies it can't have moved to different partition */
    1208 GIC          46 :             Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
    1209 ECB             : 
    1210 GIC          46 :             nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
    1211 CBC          46 :             priorXmax = HeapTupleHeaderGetUpdateXid(htup);
    1212 ECB             :         }
    1213                 :     }
    1214 GIC      204041 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a