LCOV - differential code coverage report

Current view:   top level - src/backend/access/heap - heapam.c (source / functions)
Current:        Differential Code Coverage HEAD vs 15
Current Date:   2023-04-08 15:15:32
Baseline:       15
Baseline Date:  2023-04-08 15:09:40

Lines:      90.4 % (3043 of 3365 hit)
            UNC 72    LBC 88    UIC 142    UBC 20    GBC 80    GIC 1721
            GNC 442   CBC 800   EUB 181    ECB 1832  DUB 41    DCB 323
Functions:  97.9 % (92 of 94 hit); per-category counts as reported: 2 76 16 2 78 14

Legend: Lines: hit | not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * heapam.c
       4                 :  *    heap access method code
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/access/heap/heapam.c
      12                 :  *
      13                 :  *
      14                 :  * INTERFACE ROUTINES
      15                 :  *      heap_beginscan  - begin relation scan
      16                 :  *      heap_rescan     - restart a relation scan
      17                 :  *      heap_endscan    - end relation scan
      18                 :  *      heap_getnext    - retrieve next tuple in scan
      19                 :  *      heap_fetch      - retrieve tuple with given tid
      20                 :  *      heap_insert     - insert tuple into a relation
      21                 :  *      heap_multi_insert - insert multiple tuples into a relation
      22                 :  *      heap_delete     - delete a tuple from a relation
      23                 :  *      heap_update     - replace a tuple in a relation with another tuple
      24                 :  *
      25                 :  * NOTES
      26                 :  *    This file contains the heap_ routines which implement
      27                 :  *    the POSTGRES heap access method used for all POSTGRES
      28                 :  *    relations.
      29                 :  *
      30                 :  *-------------------------------------------------------------------------
      31                 :  */
      32                 : #include "postgres.h"
      33                 : 
      34                 : #include "access/bufmask.h"
      35                 : #include "access/genam.h"
      36                 : #include "access/heapam.h"
      37                 : #include "access/heapam_xlog.h"
      38                 : #include "access/heaptoast.h"
      39                 : #include "access/hio.h"
      40                 : #include "access/multixact.h"
      41                 : #include "access/parallel.h"
      42                 : #include "access/relscan.h"
      43                 : #include "access/subtrans.h"
      44                 : #include "access/syncscan.h"
      45                 : #include "access/sysattr.h"
      46                 : #include "access/tableam.h"
      47                 : #include "access/transam.h"
      48                 : #include "access/valid.h"
      49                 : #include "access/visibilitymap.h"
      50                 : #include "access/xact.h"
      51                 : #include "access/xlog.h"
      52                 : #include "access/xloginsert.h"
      53                 : #include "access/xlogutils.h"
      54                 : #include "catalog/catalog.h"
      55                 : #include "commands/vacuum.h"
      56                 : #include "miscadmin.h"
      57                 : #include "pgstat.h"
      58                 : #include "port/atomics.h"
      59                 : #include "port/pg_bitutils.h"
      60                 : #include "storage/bufmgr.h"
      61                 : #include "storage/freespace.h"
      62                 : #include "storage/lmgr.h"
      63                 : #include "storage/predicate.h"
      64                 : #include "storage/procarray.h"
      65                 : #include "storage/smgr.h"
      66                 : #include "storage/spin.h"
      67                 : #include "storage/standby.h"
      68                 : #include "utils/datum.h"
      69                 : #include "utils/inval.h"
      70                 : #include "utils/lsyscache.h"
      71                 : #include "utils/relcache.h"
      72                 : #include "utils/snapmgr.h"
      73                 : #include "utils/spccache.h"
      74                 : 
      75                 : 
      76                 : static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
      77                 :                                      TransactionId xid, CommandId cid, int options);
      78                 : static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
      79                 :                                   Buffer newbuf, HeapTuple oldtup,
      80                 :                                   HeapTuple newtup, HeapTuple old_key_tuple,
      81                 :                                   bool all_visible_cleared, bool new_all_visible_cleared);
      82                 : static Bitmapset *HeapDetermineColumnsInfo(Relation relation,
      83                 :                                            Bitmapset *interesting_cols,
      84                 :                                            Bitmapset *external_cols,
      85                 :                                            HeapTuple oldtup, HeapTuple newtup,
      86                 :                                            bool *has_external);
      87                 : static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
      88                 :                                  LockTupleMode mode, LockWaitPolicy wait_policy,
      89                 :                                  bool *have_tuple_lock);
      90                 : static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
      91                 :                                       uint16 old_infomask2, TransactionId add_to_xmax,
      92                 :                                       LockTupleMode mode, bool is_update,
      93                 :                                       TransactionId *result_xmax, uint16 *result_infomask,
      94                 :                                       uint16 *result_infomask2);
      95                 : static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
      96                 :                                          ItemPointer ctid, TransactionId xid,
      97                 :                                          LockTupleMode mode);
      98                 : static int  heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
      99                 :                                  xl_heap_freeze_plan *plans_out,
     100                 :                                  OffsetNumber *offsets_out);
     101                 : static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
     102                 :                                    uint16 *new_infomask2);
     103                 : static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
     104                 :                                              uint16 t_infomask);
     105                 : static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
     106                 :                                     LockTupleMode lockmode, bool *current_is_member);
     107                 : static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
     108                 :                             Relation rel, ItemPointer ctid, XLTW_Oper oper,
     109                 :                             int *remaining);
     110                 : static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
     111                 :                                        uint16 infomask, Relation rel, int *remaining);
     112                 : static void index_delete_sort(TM_IndexDeleteOp *delstate);
     113                 : static int  bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
     114                 : static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
     115                 : static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
     116                 :                                         bool *copy);
     117                 : 
     118                 : 
     119                 : /*
     120                 :  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
     121                 :  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
     122                 :  * update them).  This table (and the macros below) helps us determine the
     123                 :  * heavyweight lock mode and MultiXactStatus values to use for any particular
     124                 :  * tuple lock strength.
     125                 :  *
     126                 :  * Don't look at lockstatus/updstatus directly!  Use get_mxact_status_for_lock
     127                 :  * instead.
     128                 :  */
     129                 : static const struct
     130                 : {
     131                 :     LOCKMODE    hwlock;
     132                 :     int         lockstatus;
     133                 :     int         updstatus;
     134                 : }
     135                 : 
     136                 :             tupleLockExtraInfo[MaxLockTupleMode + 1] =
     137                 : {
     138                 :     {                           /* LockTupleKeyShare */
     139                 :         AccessShareLock,
     140                 :         MultiXactStatusForKeyShare,
     141                 :         -1                      /* KeyShare does not allow updating tuples */
     142                 :     },
     143                 :     {                           /* LockTupleShare */
     144                 :         RowShareLock,
     145                 :         MultiXactStatusForShare,
     146                 :         -1                      /* Share does not allow updating tuples */
     147                 :     },
     148                 :     {                           /* LockTupleNoKeyExclusive */
     149                 :         ExclusiveLock,
     150                 :         MultiXactStatusForNoKeyUpdate,
     151                 :         MultiXactStatusNoKeyUpdate
     152                 :     },
     153                 :     {                           /* LockTupleExclusive */
     154                 :         AccessExclusiveLock,
     155                 :         MultiXactStatusForUpdate,
     156                 :         MultiXactStatusUpdate
     157                 :     }
     158                 : };
     159                 : 
     160                 : /* Get the LOCKMODE for a given MultiXactStatus */
     161                 : #define LOCKMODE_from_mxstatus(status) \
     162                 :             (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
     163                 : 
     164                 : /*
     165                 :  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
     166                 :  * This is more readable than having every caller translate it to lock.h's
     167                 :  * LOCKMODE.
     168                 :  */
     169                 : #define LockTupleTuplock(rel, tup, mode) \
     170                 :     LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
     171                 : #define UnlockTupleTuplock(rel, tup, mode) \
     172                 :     UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
     173                 : #define ConditionalLockTupleTuplock(rel, tup, mode) \
     174                 :     ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
     175                 : 
     176                 : #ifdef USE_PREFETCH
     177                 : /*
     178                 :  * heap_index_delete_tuples and index_delete_prefetch_buffer use this
     179                 :  * structure to coordinate prefetching activity
     180                 :  */
     181                 : typedef struct
     182                 : {
     183                 :     BlockNumber cur_hblkno;
     184                 :     int         next_item;
     185                 :     int         ndeltids;
     186                 :     TM_IndexDelete *deltids;
     187                 : } IndexDeletePrefetchState;
     188                 : #endif
     189                 : 
     190                 : /* heap_index_delete_tuples bottom-up index deletion costing constants */
     191                 : #define BOTTOMUP_MAX_NBLOCKS            6
     192                 : #define BOTTOMUP_TOLERANCE_NBLOCKS      3
     193                 : 
     194                 : /*
     195                 :  * heap_index_delete_tuples uses this when determining which heap blocks it
     196                 :  * must visit to help its bottom-up index deletion caller
     197                 :  */
     198                 : typedef struct IndexDeleteCounts
     199                 : {
     200                 :     int16       npromisingtids; /* Number of "promising" TIDs in group */
     201                 :     int16       ntids;          /* Number of TIDs in group */
     202                 :     int16       ifirsttid;      /* Offset to group's first deltid */
     203                 : } IndexDeleteCounts;
     204                 : 
     205                 : /*
      206                 :  * This table gives the tuple lock strength to use for each particular
      207                 :  * MultiXactStatus value.
     208                 :  */
     209                 : static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
     210                 : {
     211                 :     LockTupleKeyShare,          /* ForKeyShare */
     212                 :     LockTupleShare,             /* ForShare */
     213                 :     LockTupleNoKeyExclusive,    /* ForNoKeyUpdate */
     214                 :     LockTupleExclusive,         /* ForUpdate */
     215                 :     LockTupleNoKeyExclusive,    /* NoKeyUpdate */
     216                 :     LockTupleExclusive          /* Update */
     217                 : };
     218                 : 
     219                 : /* Get the LockTupleMode for a given MultiXactStatus */
     220                 : #define TUPLOCK_from_mxstatus(status) \
     221                 :             (MultiXactStatusLock[(status)])
     222                 : 
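
Taken together, the two lookup tables above give LOCKMODE_from_mxstatus its
round trip: a MultiXactStatus indexes MultiXactStatusLock[] to yield a
LockTupleMode, which indexes tupleLockExtraInfo[] to yield the heavyweight
lock.  A minimal standalone sketch of that composition (plain C, not
PostgreSQL code; the enum members and lock names are illustrative stand-ins
for the definitions in lock.h and multixact.h):

    #include <stdio.h>

    /* Illustrative stand-ins; the real enums live in lock.h/multixact.h. */
    typedef enum { KeyShare, Share, NoKeyExclusive, Exclusive } TupleMode;
    typedef enum { ForKeyShare, ForShare, ForNoKeyUpdate, ForUpdate,
                   NoKeyUpdate, Update } MxStatus;

    /* Mirrors tupleLockExtraInfo[].hwlock: tuple lock mode -> heavyweight lock */
    static const char *const hwlock[] = {
        "AccessShareLock", "RowShareLock", "ExclusiveLock", "AccessExclusiveLock"
    };

    /* Mirrors MultiXactStatusLock[]: MultiXactStatus -> tuple lock mode */
    static const TupleMode status_to_mode[] = {
        KeyShare, Share, NoKeyExclusive, Exclusive, NoKeyExclusive, Exclusive
    };

    int main(void)
    {
        /* LOCKMODE_from_mxstatus composes the two lookups. */
        for (int s = ForKeyShare; s <= Update; s++)
            printf("status %d -> mode %d -> %s\n",
                   s, (int) status_to_mode[s], hwlock[status_to_mode[s]]);
        return 0;
    }
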
     223                 : /* ----------------------------------------------------------------
     224                 :  *                       heap support routines
     225                 :  * ----------------------------------------------------------------
     226                 :  */
     227                 : 
     228                 : /* ----------------
     229                 :  *      initscan - scan code common to heap_beginscan and heap_rescan
     230                 :  * ----------------
     231                 :  */
     232                 : static void
     233 GIC     1218368 : initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
     234                 : {
     235         1218368 :     ParallelBlockTableScanDesc bpscan = NULL;
     236                 :     bool        allow_strat;
     237 ECB             :     bool        allow_sync;
     238                 : 
     239                 :     /*
     240                 :      * Determine the number of blocks we have to scan.
     241                 :      *
     242                 :      * It is sufficient to do this once at scan start, since any tuples added
     243                 :      * while the scan is in progress will be invisible to my snapshot anyway.
     244                 :      * (That is not true when using a non-MVCC snapshot.  However, we couldn't
     245                 :      * guarantee to return tuples added after scan start anyway, since they
     246                 :      * might go into pages we already scanned.  To guarantee consistent
     247                 :      * results for a non-MVCC snapshot, the caller must hold some higher-level
     248                 :      * lock that ensures the interesting tuple(s) won't change.)
     249                 :      */
     250 GIC     1218368 :     if (scan->rs_base.rs_parallel != NULL)
     251                 :     {
     252            1949 :         bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
     253            1949 :         scan->rs_nblocks = bpscan->phs_nblocks;
     254 ECB             :     }
     255                 :     else
     256 CBC     1216419 :         scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
     257 ECB             : 
     258                 :     /*
     259                 :      * If the table is large relative to NBuffers, use a bulk-read access
     260                 :      * strategy and enable synchronized scanning (see syncscan.c).  Although
     261                 :      * the thresholds for these features could be different, we make them the
     262                 :      * same so that there are only two behaviors to tune rather than four.
     263                 :      * (However, some callers need to be able to disable one or both of these
     264                 :      * behaviors, independently of the size of the table; also there is a GUC
     265                 :      * variable that can disable synchronized scanning.)
     266                 :      *
     267                 :      * Note that table_block_parallelscan_initialize has a very similar test;
     268                 :      * if you change this, consider changing that one, too.
     269                 :      */
     270 GIC     1218367 :     if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
     271         1212618 :         scan->rs_nblocks > NBuffers / 4)
     272                 :     {
     273           10597 :         allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
     274 CBC       10597 :         allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
     275 ECB             :     }
     276                 :     else
     277 CBC     1207770 :         allow_strat = allow_sync = false;
     278 ECB             : 
     279 GIC     1218367 :     if (allow_strat)
     280                 :     {
     281 ECB             :         /* During a rescan, keep the previous strategy object. */
     282 GIC        8779 :         if (scan->rs_strategy == NULL)
     283 CBC        8703 :             scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
     284                 :     }
     285                 :     else
     286 ECB             :     {
     287 CBC     1209588 :         if (scan->rs_strategy != NULL)
     288 UIC           0 :             FreeAccessStrategy(scan->rs_strategy);
     289 GIC     1209588 :         scan->rs_strategy = NULL;
     290                 :     }
     291 ECB             : 
     292 GBC     1218367 :     if (scan->rs_base.rs_parallel != NULL)
     293 ECB             :     {
     294                 :         /* For parallel scan, believe whatever ParallelTableScanDesc says. */
     295 GIC        1949 :         if (scan->rs_base.rs_parallel->phs_syncscan)
     296 CBC           2 :             scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
     297                 :         else
     298 GIC        1947 :             scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
     299 ECB             :     }
     300 CBC     1216418 :     else if (keep_startblock)
     301                 :     {
     302 ECB             :         /*
     303                 :          * When rescanning, we want to keep the previous startblock setting,
     304                 :          * so that rewinding a cursor doesn't generate surprising results.
     305                 :          * Reset the active syncscan setting, though.
     306                 :          */
     307 GIC      380886 :         if (allow_sync && synchronize_seqscans)
     308 UIC           0 :             scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
     309                 :         else
     310 GIC      380886 :             scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
     311 ECB             :     }
     312 GBC      835532 :     else if (allow_sync && synchronize_seqscans)
     313                 :     {
     314 CBC          63 :         scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
     315 GIC          63 :         scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
     316 ECB             :     }
     317                 :     else
     318                 :     {
     319 CBC      835469 :         scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
     320 GIC      835469 :         scan->rs_startblock = 0;
     321                 :     }
     322                 : 
     323 CBC     1218367 :     scan->rs_numblocks = InvalidBlockNumber;
     324         1218367 :     scan->rs_inited = false;
     325 GIC     1218367 :     scan->rs_ctup.t_data = NULL;
     326         1218367 :     ItemPointerSetInvalid(&scan->rs_ctup.t_self);
     327 CBC     1218367 :     scan->rs_cbuf = InvalidBuffer;
     328         1218367 :     scan->rs_cblock = InvalidBlockNumber;
     329 ECB             : 
     330                 :     /* page-at-a-time fields are always invalid when not rs_inited */
     331                 : 
     332                 :     /*
     333                 :      * copy the scan key, if appropriate
     334                 :      */
     335 GIC     1218367 :     if (key != NULL && scan->rs_base.rs_nkeys > 0)
     336          643366 :         memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
     337                 : 
     338                 :     /*
     339 ECB             :      * Currently, we only have a stats counter for sequential heap scans (but
      340                 :      * e.g. for bitmap scans the underlying bitmap index scans will be counted,
     341                 :      * and for sample scans we update stats for tuple fetches).
     342                 :      */
     343 GIC     1218367 :     if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
     344         1181294 :         pgstat_count_heap_scan(scan->rs_base.rs_rd);
     345         1218367 : }
     346                 : 
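
The size test in initscan() gates both features on one threshold: a relation
in shared buffers that is larger than a quarter of NBuffers gets the
bulk-read strategy and synchronized scanning, subject to the caller's
SO_ALLOW_STRAT/SO_ALLOW_SYNC flags.  A minimal sketch of just that policy
(standalone C; the function name and parameters are illustrative, and the
flag checks are omitted):

    #include <assert.h>
    #include <stdbool.h>

    /* A scan qualifies for the bulk-read strategy and synchronized scanning
     * only when the relation uses shared buffers and occupies more than a
     * quarter of them. */
    static bool
    use_strat_and_sync(bool local_buffers, unsigned nblocks, unsigned nbuffers)
    {
        return !local_buffers && nblocks > nbuffers / 4;
    }

    int main(void)
    {
        assert(!use_strat_and_sync(false, 100, 16384));  /* small rel: neither */
        assert(use_strat_and_sync(false, 5000, 16384));  /* > NBuffers/4: both */
        assert(!use_strat_and_sync(true, 5000, 16384));  /* temp rel: never */
        return 0;
    }
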
     347 ECB             : /*
     348                 :  * heap_setscanlimits - restrict range of a heapscan
     349                 :  *
     350                 :  * startBlk is the page to start at
     351                 :  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
     352                 :  */
     353                 : void
     354 GIC        1821 : heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
     355                 : {
     356            1821 :     HeapScanDesc scan = (HeapScanDesc) sscan;
     357                 : 
     358 CBC        1821 :     Assert(!scan->rs_inited);    /* else too late to change */
     359                 :     /* else rs_startblock is significant */
     360            1821 :     Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
     361                 : 
     362 ECB             :     /* Check startBlk is valid (but allow case of zero blocks...) */
     363 GIC        1821 :     Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
     364 ECB             : 
     365 GIC        1821 :     scan->rs_startblock = startBlk;
     366            1821 :     scan->rs_numblocks = numBlks;
     367 CBC        1821 : }
     368                 : 
     369 ECB             : /*
     370                 :  * heapgetpage - subroutine for heapgettup()
     371                 :  *
     372                 :  * This routine reads and pins the specified page of the relation.
     373                 :  * In page-at-a-time mode it performs additional work, namely determining
     374                 :  * which tuples on the page are visible.
     375                 :  */
     376                 : void
     377 GNC     4549768 : heapgetpage(TableScanDesc sscan, BlockNumber block)
     378                 : {
     379 GIC     4549768 :     HeapScanDesc scan = (HeapScanDesc) sscan;
     380                 :     Buffer      buffer;
     381 ECB             :     Snapshot    snapshot;
     382                 :     Page        page;
     383                 :     int         lines;
     384                 :     int         ntup;
     385                 :     OffsetNumber lineoff;
     386                 :     bool        all_visible;
     387                 : 
     388 GNC     4549768 :     Assert(block < scan->rs_nblocks);
     389                 : 
     390                 :     /* release previous scan buffer, if any */
     391 CBC     4549768 :     if (BufferIsValid(scan->rs_cbuf))
     392                 :     {
     393 GIC     3660198 :         ReleaseBuffer(scan->rs_cbuf);
     394 CBC     3660198 :         scan->rs_cbuf = InvalidBuffer;
     395                 :     }
     396 ECB             : 
     397                 :     /*
     398                 :      * Be sure to check for interrupts at least once per page.  Checks at
     399                 :      * higher code levels won't be able to stop a seqscan that encounters many
     400                 :      * pages' worth of consecutive dead tuples.
     401                 :      */
     402 GIC     4549768 :     CHECK_FOR_INTERRUPTS();
     403                 : 
     404                 :     /* read page using selected strategy */
     405 GNC     4549767 :     scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, block,
     406                 :                                        RBM_NORMAL, scan->rs_strategy);
     407         4549767 :     scan->rs_cblock = block;
     408 ECB             : 
     409 GIC     4549767 :     if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
     410 CBC       87463 :         return;
     411                 : 
     412         4462304 :     buffer = scan->rs_cbuf;
     413         4462304 :     snapshot = scan->rs_base.rs_snapshot;
     414                 : 
     415 ECB             :     /*
     416                 :      * Prune and repair fragmentation for the whole page, if possible.
     417                 :      */
     418 GIC     4462304 :     heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
     419                 : 
     420                 :     /*
     421 ECB             :      * We must hold share lock on the buffer content while examining tuple
     422                 :      * visibility.  Afterwards, however, the tuples we have found to be
     423                 :      * visible are guaranteed good as long as we hold the buffer pin.
     424                 :      */
     425 GIC     4462304 :     LockBuffer(buffer, BUFFER_LOCK_SHARE);
     426                 : 
     427 GNC     4462304 :     page = BufferGetPage(buffer);
     428         4462304 :     TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, page);
     429         4462302 :     lines = PageGetMaxOffsetNumber(page);
     430 CBC     4462302 :     ntup = 0;
     431 ECB             : 
     432                 :     /*
     433                 :      * If the all-visible flag indicates that all tuples on the page are
     434                 :      * visible to everyone, we can skip the per-tuple visibility tests.
     435                 :      *
     436                 :      * Note: In hot standby, a tuple that's already visible to all
     437                 :      * transactions on the primary might still be invisible to a read-only
     438                 :      * transaction in the standby. We partly handle this problem by tracking
     439                 :      * the minimum xmin of visible tuples as the cut-off XID while marking a
     440                 :      * page all-visible on the primary and WAL log that along with the
     441                 :      * visibility map SET operation. In hot standby, we wait for (or abort)
     442                 :      * all transactions that can potentially may not see one or more tuples on
     443                 :      * the page. That's how index-only scans work fine in hot standby. A
     444                 :      * crucial difference between index-only scans and heap scans is that the
     445                 :      * index-only scan completely relies on the visibility map where as heap
     446                 :      * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
     447                 :      * the page-level flag can be trusted in the same way, because it might
     448                 :      * get propagated somehow without being explicitly WAL-logged, e.g. via a
     449                 :      * full page write. Until we can prove that beyond doubt, let's check each
     450                 :      * tuple for visibility the hard way.
     451                 :      */
     452 GNC     4462302 :     all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
     453                 : 
     454       225462979 :     for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++)
     455 ECB             :     {
     456 GNC   221000685 :         ItemId      lpp = PageGetItemId(page, lineoff);
     457                 :         HeapTupleData loctup;
     458                 :         bool        valid;
     459                 : 
     460       221000685 :         if (!ItemIdIsNormal(lpp))
     461        13109416 :             continue;
     462 ECB             : 
     463 GNC   207891269 :         loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
     464       207891269 :         loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp);
     465       207891269 :         loctup.t_len = ItemIdGetLength(lpp);
     466       207891269 :         ItemPointerSet(&(loctup.t_self), block, lineoff);
     467 ECB             : 
     468 GNC   207891269 :         if (all_visible)
     469        23568478 :             valid = true;
     470                 :         else
     471       184322791 :             valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
     472                 : 
     473       207891269 :         HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
     474                 :                                             &loctup, buffer, snapshot);
     475                 : 
     476       207891261 :         if (valid)
     477       202263456 :             scan->rs_vistuples[ntup++] = lineoff;
     478 ECB             :     }
     479                 : 
     480 GIC     4462294 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
     481                 : 
     482 CBC     4462294 :     Assert(ntup <= MaxHeapTuplesPerPage);
     483 GIC     4462294 :     scan->rs_ntuples = ntup;
     484 ECB             : }
     485                 : 
     486                 : /*
     487                 :  * heapgettup_initial_block - return the first BlockNumber to scan
     488                 :  *
     489                 :  * Returns InvalidBlockNumber when there are no blocks to scan.  This can
     490                 :  * occur with empty tables and in parallel scans when parallel workers get all
      491                 :  * occur with empty tables and in parallel scans when parallel workers claim
      492                 :  * all of the pages before we get a chance to claim our first one.
     493                 : static BlockNumber
     494 GNC     1180761 : heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir)
     495                 : {
     496         1180761 :     Assert(!scan->rs_inited);
     497                 : 
     498                 :     /* When there are no pages to scan, return InvalidBlockNumber */
     499         1180761 :     if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
     500          290783 :         return InvalidBlockNumber;
     501                 : 
     502          889978 :     if (ScanDirectionIsForward(dir))
     503                 :     {
     504                 :         /* serial scan */
     505          889947 :         if (scan->rs_base.rs_parallel == NULL)
     506          888651 :             return scan->rs_startblock;
     507                 :         else
     508                 :         {
     509                 :             /* parallel scan */
     510            1296 :             table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
     511            1296 :                                                      scan->rs_parallelworkerdata,
     512            1296 :                                                      (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel);
     513                 : 
     514                 :             /* may return InvalidBlockNumber if there are no more blocks */
     515            1296 :             return table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
     516            1296 :                                                      scan->rs_parallelworkerdata,
     517            1296 :                                                      (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel);
     518                 :         }
     519                 :     }
     520                 :     else
     521                 :     {
     522                 :         /* backward parallel scan not supported */
     523              31 :         Assert(scan->rs_base.rs_parallel == NULL);
     524                 : 
     525                 :         /*
     526                 :          * Disable reporting to syncscan logic in a backwards scan; it's not
     527                 :          * very likely anyone else is doing the same thing at the same time,
     528                 :          * and much more likely that we'll just bollix things for forward
     529                 :          * scanners.
     530                 :          */
     531              31 :         scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
     532                 : 
     533                 :         /*
     534                 :          * Start from last page of the scan.  Ensure we take into account
     535                 :          * rs_numblocks if it's been adjusted by heap_setscanlimits().
     536                 :          */
     537              31 :         if (scan->rs_numblocks != InvalidBlockNumber)
     538               3 :             return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
     539                 : 
     540              28 :         if (scan->rs_startblock > 0)
     541 UNC           0 :             return scan->rs_startblock - 1;
     542                 : 
     543 GNC          28 :         return scan->rs_nblocks - 1;
     544                 :     }
     545                 : }
     546                 : 
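
The backward-scan arithmetic above is worth a worked example.  A standalone
sketch (plain C; names are illustrative, with unsigned standing in for
BlockNumber):

    #include <assert.h>

    /* Start block for a backward scan: the last block of the limited range
     * (wrapping modulo the relation size) when heap_setscanlimits() is in
     * force, else the block just before rs_startblock, wrapping to the end
     * of the heap when rs_startblock is 0. */
    static unsigned
    backward_start(unsigned startblock, unsigned numblocks,
                   unsigned nblocks, int limited)
    {
        if (limited)
            return (startblock + numblocks - 1) % nblocks;
        return (startblock > 0) ? startblock - 1 : nblocks - 1;
    }

    int main(void)
    {
        assert(backward_start(8, 4, 10, 1) == 1);  /* range 8,9,0,1 ends at 1 */
        assert(backward_start(5, 0, 10, 0) == 4);
        assert(backward_start(0, 0, 10, 0) == 9);  /* wrap to the last block */
        return 0;
    }
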
     547                 : 
     548                 : /*
     549                 :  * heapgettup_start_page - helper function for heapgettup()
     550                 :  *
      551                 :  * Return the next page to scan based on scan->rs_cbuf and set *linesleft
      552                 :  * to the number of tuples on this page.  Also set *lineoff to the first
      553                 :  * offset to scan: forward scans get the first offset on the page,
      554                 :  * backward scans the final one.
     555                 :  */
     556                 : static Page
     557           85369 : heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft,
     558                 :                       OffsetNumber *lineoff)
     559                 : {
     560                 :     Page        page;
     561                 : 
     562           85369 :     Assert(scan->rs_inited);
     563           85369 :     Assert(BufferIsValid(scan->rs_cbuf));
     564                 : 
     565                 :     /* Caller is responsible for ensuring buffer is locked if needed */
     566           85369 :     page = BufferGetPage(scan->rs_cbuf);
     567                 : 
     568           85369 :     TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page);
     569                 : 
     570           85369 :     *linesleft = PageGetMaxOffsetNumber(page) - FirstOffsetNumber + 1;
     571                 : 
     572           85369 :     if (ScanDirectionIsForward(dir))
     573           85369 :         *lineoff = FirstOffsetNumber;
     574                 :     else
     575 UNC           0 :         *lineoff = (OffsetNumber) (*linesleft);
     576                 : 
     577                 :     /* lineoff now references the physically previous or next tid */
     578 GNC       85369 :     return page;
     579                 : }
     580                 : 
     581                 : 
     582                 : /*
     583                 :  * heapgettup_continue_page - helper function for heapgettup()
     584                 :  *
      585                 :  * Return the next page to scan based on scan->rs_cbuf and set *linesleft
     586                 :  * to the number of tuples left to scan on this page.  Also set *lineoff to
     587                 :  * the next offset to scan according to the ScanDirection in 'dir'.
     588                 :  */
     589                 : static inline Page
     590         7503635 : heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft,
     591                 :                          OffsetNumber *lineoff)
     592                 : {
     593                 :     Page        page;
     594                 : 
     595         7503635 :     Assert(scan->rs_inited);
     596         7503635 :     Assert(BufferIsValid(scan->rs_cbuf));
     597                 : 
     598                 :     /* Caller is responsible for ensuring buffer is locked if needed */
     599         7503635 :     page = BufferGetPage(scan->rs_cbuf);
     600                 : 
     601         7503635 :     TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page);
     602                 : 
     603         7503635 :     if (ScanDirectionIsForward(dir))
     604                 :     {
     605         7503635 :         *lineoff = OffsetNumberNext(scan->rs_coffset);
     606         7503635 :         *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
     607                 :     }
     608                 :     else
     609                 :     {
     610                 :         /*
      611                 :          * The previously returned tuple may have been vacuumed since the
     612                 :          * previous scan when we use a non-MVCC snapshot, so we must
     613                 :          * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
     614                 :          */
     615 UNC           0 :         *lineoff = Min(PageGetMaxOffsetNumber(page), OffsetNumberPrev(scan->rs_coffset));
     616               0 :         *linesleft = *lineoff;
     617                 :     }
     618                 : 
     619                 :     /* lineoff now references the physically previous or next tid */
     620 GNC     7503635 :     return page;
     621                 : }
     622                 : 
     623                 : /*
     624                 :  * heapgettup_advance_block - helper for heapgettup() and heapgettup_pagemode()
     625                 :  *
     626                 :  * Given the current block number, the scan direction, and various information
     627                 :  * contained in the scan descriptor, calculate the BlockNumber to scan next
     628                 :  * and return it.  If there are no further blocks to scan, return
     629                 :  * InvalidBlockNumber to indicate this fact to the caller.
     630                 :  *
     631                 :  * This should not be called to determine the initial block number -- only for
     632                 :  * subsequent blocks.
     633                 :  *
     634                 :  * This also adjusts rs_numblocks when a limit has been imposed by
     635                 :  * heap_setscanlimits().
     636                 :  */
     637                 : static inline BlockNumber
     638         4091735 : heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir)
     639                 : {
     640         4091735 :     if (ScanDirectionIsForward(dir))
     641                 :     {
     642         4091690 :         if (scan->rs_base.rs_parallel == NULL)
     643                 :         {
     644         3995847 :             block++;
     645                 : 
     646                 :             /* wrap back to the start of the heap */
     647         3995847 :             if (block >= scan->rs_nblocks)
     648          435528 :                 block = 0;
     649                 : 
     650                 :             /* we're done if we're back at where we started */
     651         3995847 :             if (block == scan->rs_startblock)
     652          435487 :                 return InvalidBlockNumber;
     653                 : 
     654                 :             /* check if the limit imposed by heap_setscanlimits() is met */
     655         3560360 :             if (scan->rs_numblocks != InvalidBlockNumber)
     656                 :             {
     657            1551 :                 if (--scan->rs_numblocks == 0)
     658            1487 :                     return InvalidBlockNumber;
     659                 :             }
     660                 : 
     661                 :             /*
     662                 :              * Report our new scan position for synchronization purposes. We
     663                 :              * don't do that when moving backwards, however. That would just
     664                 :              * mess up any other forward-moving scanners.
     665                 :              *
     666                 :              * Note: we do this before checking for end of scan so that the
     667                 :              * final state of the position hint is back at the start of the
     668                 :              * rel.  That's not strictly necessary, but otherwise when you run
     669                 :              * the same query multiple times the starting position would shift
     670                 :              * a little bit backwards on every invocation, which is confusing.
     671                 :              * We don't guarantee any specific ordering in general, though.
     672                 :              */
     673         3558873 :             if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
     674           13162 :                 ss_report_location(scan->rs_base.rs_rd, block);
     675                 : 
     676         3558873 :             return block;
     677                 :         }
     678                 :         else
     679                 :         {
     680           95843 :             return table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
     681           95843 :                                                      scan->rs_parallelworkerdata, (ParallelBlockTableScanDesc)
     682           95843 :                                                      scan->rs_base.rs_parallel);
     683                 :         }
     684                 :     }
     685                 :     else
     686                 :     {
     687                 :         /* we're done if the last block is the start position */
     688              45 :         if (block == scan->rs_startblock)
     689              45 :             return InvalidBlockNumber;
     690                 : 
     691                 :         /* check if the limit imposed by heap_setscanlimits() is met */
     692 UNC           0 :         if (scan->rs_numblocks != InvalidBlockNumber)
     693                 :         {
     694               0 :             if (--scan->rs_numblocks == 0)
     695               0 :                 return InvalidBlockNumber;
     696                 :         }
     697                 : 
     698                 :         /* wrap to the end of the heap when the last page was page 0 */
     699               0 :         if (block == 0)
     700               0 :             block = scan->rs_nblocks;
     701                 : 
     702               0 :         block--;
     703                 : 
     704               0 :         return block;
     705                 :     }
     706                 : }
     707                 : 
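
In the serial forward case this is a circular walk: increment, wrap back to
block 0 past the end of the heap, and stop upon returning to the start
block.  A runnable standalone model (plain C; names are illustrative)
showing that a scan starting mid-relation still visits every block exactly
once:

    #include <assert.h>
    #include <stdio.h>

    #define INVALID_BLOCK ((unsigned) -1)   /* stands in for InvalidBlockNumber */

    /* Model of the serial forward case of heapgettup_advance_block(). */
    static unsigned
    advance(unsigned block, unsigned startblock, unsigned nblocks)
    {
        block++;
        if (block >= nblocks)
            block = 0;              /* wrap back to the start of the heap */
        if (block == startblock)
            return INVALID_BLOCK;   /* done: we have come full circle */
        return block;
    }

    int main(void)
    {
        unsigned    nblocks = 7, start = 4, visited = 0;

        for (unsigned b = start; b != INVALID_BLOCK;
             b = advance(b, start, nblocks))
        {
            printf("scan block %u\n", b);
            visited++;
        }
        assert(visited == nblocks); /* every block seen exactly once */
        return 0;
    }
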
     708                 : /* ----------------
     709                 :  *      heapgettup - fetch next heap tuple
     710                 :  *
     711                 :  *      Initialize the scan if not already done; then advance to the next
     712                 :  *      tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
     713                 :  *      or set scan->rs_ctup.t_data = NULL if no more tuples.
     714                 :  *
     715 ECB             :  * Note: the reason nkeys/key are passed separately, even though they are
     716                 :  * kept in the scan descriptor, is that the caller may not want us to check
     717                 :  * the scankeys.
     718                 :  *
     719                 :  * Note: when we fall off the end of the scan in either direction, we
     720                 :  * reset rs_inited.  This means that a further request with the same
     721                 :  * scan direction will restart the scan, which is a bit odd, but a
     722                 :  * request with the opposite scan direction will start a fresh scan
     723                 :  * in the proper direction.  The latter is required behavior for cursors,
     724                 :  * while the former case is generally undefined behavior in Postgres
     725                 :  * so we don't care too much.
     726                 :  * ----------------
     727                 :  */
     728                 : static void
     729 GIC     7521441 : heapgettup(HeapScanDesc scan,
     730                 :            ScanDirection dir,
     731 ECB             :            int nkeys,
     732                 :            ScanKey key)
     733                 : {
     734 GIC     7521441 :     HeapTuple   tuple = &(scan->rs_ctup);
     735                 :     BlockNumber block;
     736                 :     Page        page;
     737                 :     OffsetNumber lineoff;
     738                 :     int         linesleft;
     739 ECB             : 
     740 GNC     7521441 :     if (unlikely(!scan->rs_inited))
     741                 :     {
     742           17806 :         block = heapgettup_initial_block(scan, dir);
     743                 :         /* ensure rs_cbuf is invalid when we get InvalidBlockNumber */
     744           17806 :         Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf));
     745           17806 :         scan->rs_inited = true;
     746                 :     }
     747                 :     else
     748 ECB             :     {
     749                 :         /* continue from previously returned page/tuple */
     750 GNC     7503635 :         block = scan->rs_cblock;
     751                 : 
     752         7503635 :         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
     753         7503635 :         page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
     754         7503635 :         goto continue_page;
     755 ECB             :     }
     756                 : 
     757                 :     /*
     758                 :      * advance the scan until we find a qualifying tuple or run out of stuff
     759                 :      * to scan
     760                 :      */
     761 GNC      103028 :     while (block != InvalidBlockNumber)
     762 ECB             :     {
     763 GNC       85369 :         heapgetpage((TableScanDesc) scan, block);
     764           85369 :         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
     765           85369 :         page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
     766         7589004 : continue_page:
     767                 : 
     768                 :         /*
     769                 :          * Only continue scanning the page while we have lines left.
     770 EUB             :          *
     771                 :          * Note that this protects us from accessing line pointers past
     772                 :          * PageGetMaxOffsetNumber(); both for forward scans when we resume the
     773                 :          * table scan, and for when we start scanning a new page.
     774                 :          */
     775 GNC     7623853 :         for (; linesleft > 0; linesleft--, lineoff += dir)
     776                 :         {
     777                 :             bool        visible;
     778         7538631 :             ItemId      lpp = PageGetItemId(page, lineoff);
     779 EUB             : 
     780 GNC     7538631 :             if (!ItemIdIsNormal(lpp))
     781           29667 :                 continue;
     782                 : 
     783         7508964 :             tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
     784         7508964 :             tuple->t_len = ItemIdGetLength(lpp);
     785         7508964 :             ItemPointerSet(&(tuple->t_self), block, lineoff);
     786                 : 
     787         7508964 :             visible = HeapTupleSatisfiesVisibility(tuple,
     788                 :                                                    scan->rs_base.rs_snapshot,
     789                 :                                                    scan->rs_cbuf);
     790                 : 
     791         7508964 :             HeapCheckForSerializableConflictOut(visible, scan->rs_base.rs_rd,
     792                 :                                                 tuple, scan->rs_cbuf,
     793                 :                                                 scan->rs_base.rs_snapshot);
     794                 : 
     795                 :             /* skip tuples not visible to this snapshot */
     796         7508964 :             if (!visible)
     797            5182 :                 continue;
     798                 : 
     799                 :             /* skip any tuples that don't match the scan key */
     800         7503782 :             if (key != NULL &&
     801 UNC           0 :                 !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
     802                 :                              nkeys, key))
     803               0 :                 continue;
     804                 : 
     805 GNC     7503782 :             LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
     806         7503782 :             scan->rs_coffset = lineoff;
     807         7503782 :             return;
     808                 :         }
     809 ECB             : 
     810                 :         /*
     811                 :          * if we get here, it means we've exhausted the items on this page and
     812                 :          * it's time to move to the next.
     813                 :          */
     814 GIC       85222 :         LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
     815 ECB             : 
     816                 :         /* get the BlockNumber to scan next */
     817 GNC       85222 :         block = heapgettup_advance_block(scan, block, dir);
     818                 :     }
     819                 : 
     820                 :     /* end of scan */
     821           17659 :     if (BufferIsValid(scan->rs_cbuf))
     822            3811 :         ReleaseBuffer(scan->rs_cbuf);
     823                 : 
     824           17659 :     scan->rs_cbuf = InvalidBuffer;
     825           17659 :     scan->rs_cblock = InvalidBlockNumber;
     826           17659 :     tuple->t_data = NULL;
     827           17659 :     scan->rs_inited = false;
     828                 : }
     829                 : 
     830                 : /* ----------------
     831                 :  *      heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
     832                 :  *
     833                 :  *      Same API as heapgettup, but used in page-at-a-time mode
     834                 :  *
     835                 :  * The internal logic is much the same as heapgettup's too, but there are some
     836                 :  * differences: we do not take the buffer content lock (that only needs to
     837                 :  * happen inside heapgetpage), and we iterate through just the tuples listed
     838                 :  * in rs_vistuples[] rather than all tuples on the page.  Notice that
     839                 :  * lineindex is 0-based, where the corresponding loop variable lineoff in
      840                 :  * lineindex is 0-based, whereas the corresponding loop variable lineoff in
     841                 :  * ----------------
     842 ECB             :  */
     843                 : static void
     844 GIC    49060348 : heapgettup_pagemode(HeapScanDesc scan,
     845                 :                     ScanDirection dir,
     846                 :                     int nkeys,
     847 ECB             :                     ScanKey key)
     848                 : {
     849 GIC    49060348 :     HeapTuple   tuple = &(scan->rs_ctup);
     850                 :     BlockNumber block;
     851                 :     Page        page;
     852 ECB             :     int         lineindex;
     853                 :     int         linesleft;
     854                 : 
     855 GNC    49060348 :     if (unlikely(!scan->rs_inited))
     856 ECB             :     {
     857 GNC     1162955 :         block = heapgettup_initial_block(scan, dir);
     858                 :         /* ensure rs_cbuf is invalid when we get InvalidBlockNumber */
     859         1162955 :         Assert(block != InvalidBlockNumber || !BufferIsValid(scan->rs_cbuf));
     860         1162955 :         scan->rs_inited = true;
     861                 :     }
     862                 :     else
     863                 :     {
     864                 :         /* continue from previously returned page/tuple */
     865        47897393 :         block = scan->rs_cblock; /* current page */
     866        47897393 :         page = BufferGetPage(scan->rs_cbuf);
     867        47897393 :         TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page);
     868                 : 
     869        47897393 :         lineindex = scan->rs_cindex + dir;
     870        47897393 :         if (ScanDirectionIsForward(dir))
     871        47897065 :             linesleft = scan->rs_ntuples - lineindex;
     872                 :         else
     873             328 :             linesleft = scan->rs_cindex;
     874                 :         /* lineindex now references the next or previous visible tid */
     875                 : 
     876        47897393 :         goto continue_page;
     877                 :     }
     878                 : 
     879 ECB             :     /*
     880                 :      * advance the scan until we find a qualifying tuple or run out of stuff
     881                 :      * to scan
     882                 :      */
     883 GNC     5169468 :     while (block != InvalidBlockNumber)
     884 ECB             :     {
     885 GNC     4458029 :         heapgetpage((TableScanDesc) scan, block);
     886         4458018 :         page = BufferGetPage(scan->rs_cbuf);
     887         4458018 :         TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, page);
     888         4458018 :         linesleft = scan->rs_ntuples;
     889         4458018 :         lineindex = ScanDirectionIsForward(dir) ? 0 : linesleft - 1;
     890                 : 
     891                 :         /* lineindex now references the next or previous visible tid */
     892        52355411 : continue_page:
     893                 : 
     894       196997813 :         for (; linesleft > 0; linesleft--, lineindex += dir)
     895 ECB             :         {
     896                 :             ItemId      lpp;
     897                 :             OffsetNumber lineoff;
     898                 : 
     899 GIC   192991300 :             lineoff = scan->rs_vistuples[lineindex];
     900 GNC   192991300 :             lpp = PageGetItemId(page, lineoff);
     901 GIC   192991300 :             Assert(ItemIdIsNormal(lpp));
     902 ECB             : 
     903 GNC   192991300 :             tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
     904 GIC   192991300 :             tuple->t_len = ItemIdGetLength(lpp);
     905 GNC   192991300 :             ItemPointerSet(&(tuple->t_self), block, lineoff);
     906                 : 
     907                 :             /* skip any tuples that don't match the scan key */
     908       192991300 :             if (key != NULL &&
     909       145561268 :                 !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
     910                 :                              nkeys, key))
     911       144642402 :                 continue;
     912                 : 
     913        48348898 :             scan->rs_cindex = lineindex;
     914 GBC    48348898 :             return;
     915                 :         }
     916                 : 
     917                 :         /* get the BlockNumber to scan next */
     918 GNC     4006513 :         block = heapgettup_advance_block(scan, block, dir);
     919                 :     }
     920                 : 
     921                 :     /* end of scan */
     922          711439 :     if (BufferIsValid(scan->rs_cbuf))
     923          434021 :         ReleaseBuffer(scan->rs_cbuf);
     924          711439 :     scan->rs_cbuf = InvalidBuffer;
     925          711439 :     scan->rs_cblock = InvalidBlockNumber;
     926          711439 :     tuple->t_data = NULL;
     927          711439 :     scan->rs_inited = false;
     928                 : }
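
The loop above folds both scan directions into a single body by using the numeric ScanDirection value (-1 for backward, +1 for forward) as the iteration step. A minimal standalone sketch of the same idiom, with hypothetical names (items, nitems, dir), shows why a start index plus a remaining-count is all the state required:

    /* Hypothetical sketch of the direction-as-step idiom used above. */
    static int
    scan_items(const int *items, int nitems, int dir)   /* dir is +1 or -1 */
    {
        int         idx = (dir == 1) ? 0 : nitems - 1;  /* first item in scan order */
        int         left = nitems;                      /* items still to visit */

        for (; left > 0; left--, idx += dir)
        {
            if (items[idx] % 2 == 0)        /* stand-in for HeapKeyTest() */
                return idx;                 /* found a qualifying item */
        }
        return -1;                          /* ran off the end: scan done */
    }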
     929                 : 
     930                 : 
     931                 : /* ----------------------------------------------------------------
     932                 :  *                   heap access method interface
     933 ECB             :  * ----------------------------------------------------------------
     934                 :  */
     935                 : 
     936                 : 
     937                 : TableScanDesc
     938 GIC      837428 : heap_beginscan(Relation relation, Snapshot snapshot,
     939 ECB             :                int nkeys, ScanKey key,
     940                 :                ParallelTableScanDesc parallel_scan,
     941                 :                uint32 flags)
     942                 : {
     943                 :     HeapScanDesc scan;
     944                 : 
     945                 :     /*
     946                 :      * increment relation ref count while scanning relation
     947                 :      *
     948                 :      * This is just to make really sure the relcache entry won't go away while
     949                 :      * the scan has a pointer to it.  Caller should be holding the rel open
     950                 :      * anyway, so this is redundant in all normal scenarios...
     951                 :      */
     952 CBC      837428 :     RelationIncrementReferenceCount(relation);
     953 ECB             : 
     954                 :     /*
     955                 :      * allocate and initialize scan descriptor
     956                 :      */
     957 GIC      837428 :     scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
     958                 : 
     959          837428 :     scan->rs_base.rs_rd = relation;
     960          837428 :     scan->rs_base.rs_snapshot = snapshot;
     961 CBC      837428 :     scan->rs_base.rs_nkeys = nkeys;
     962 GIC      837428 :     scan->rs_base.rs_flags = flags;
     963 CBC      837428 :     scan->rs_base.rs_parallel = parallel_scan;
     964 GIC      837428 :     scan->rs_strategy = NULL;    /* set in initscan */
     965 ECB             : 
     966                 :     /*
      967                 :      * Disable page-at-a-time mode if it's not an MVCC-safe snapshot.
     968                 :      */
     969 CBC      837428 :     if (!(snapshot && IsMVCCSnapshot(snapshot)))
     970 GIC       42063 :         scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
     971                 : 
     972 ECB             :     /*
     973                 :      * For seqscan and sample scans in a serializable transaction, acquire a
     974                 :      * predicate lock on the entire relation. This is required not only to
     975                 :      * lock all the matching tuples, but also to conflict with new insertions
     976                 :      * into the table. In an indexscan, we take page locks on the index pages
     977                 :      * covering the range specified in the scan qual, but in a heap scan there
     978                 :      * is nothing more fine-grained to lock. A bitmap scan is a different
     979                 :      * story, there we have already scanned the index and locked the index
     980                 :      * pages covering the predicate. But in that case we still have to lock
     981                 :      * any matching heap tuples. For sample scan we could optimize the locking
     982                 :      * to be at least page-level granularity, but we'd need to add per-tuple
     983                 :      * locking for that.
     984                 :      */
     985 GIC      837428 :     if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
     986                 :     {
     987                 :         /*
     988                 :          * Ensure a missing snapshot is noticed reliably, even if the
     989                 :          * isolation mode means predicate locking isn't performed (and
     990 ECB             :          * therefore the snapshot isn't used here).
     991                 :          */
     992 GIC      802194 :         Assert(snapshot);
     993          802194 :         PredicateLockRelation(relation, snapshot);
     994                 :     }
     995                 : 
     996                 :     /* we only need to set this up once */
     997          837428 :     scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
     998 ECB             : 
     999                 :     /*
    1000                 :      * Allocate memory to keep track of page allocation for parallel workers
    1001                 :      * when doing a parallel scan.
    1002                 :      */
    1003 GIC      837428 :     if (parallel_scan != NULL)
    1004            1895 :         scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData));
    1005 ECB             :     else
    1006 CBC      835533 :         scan->rs_parallelworkerdata = NULL;
    1007                 : 
    1008                 :     /*
    1009                 :      * we do this here instead of in initscan() because heap_rescan also calls
    1010                 :      * initscan() and we don't want to allocate memory again
    1011                 :      */
    1012          837428 :     if (nkeys > 0)
    1013 GIC      643366 :         scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
    1014                 :     else
    1015 CBC      194062 :         scan->rs_base.rs_key = NULL;
    1016 ECB             : 
    1017 GIC      837428 :     initscan(scan, key, false);
    1018                 : 
    1019          837427 :     return (TableScanDesc) scan;
    1020                 : }
    1021                 : 
    1022                 : void
    1023          380940 : heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
    1024                 :             bool allow_strat, bool allow_sync, bool allow_pagemode)
    1025                 : {
    1026          380940 :     HeapScanDesc scan = (HeapScanDesc) sscan;
    1027                 : 
    1028 CBC      380940 :     if (set_params)
    1029                 :     {
    1030              15 :         if (allow_strat)
    1031              15 :             scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
    1032                 :         else
    1033 UIC           0 :             scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
    1034 ECB             : 
    1035 GIC          15 :         if (allow_sync)
    1036               6 :             scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
    1037 ECB             :         else
    1038 CBC           9 :             scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
    1039                 : 
    1040 GIC          15 :         if (allow_pagemode && scan->rs_base.rs_snapshot &&
    1041              15 :             IsMVCCSnapshot(scan->rs_base.rs_snapshot))
    1042 CBC          15 :             scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
    1043                 :         else
    1044 UIC           0 :             scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
    1045 ECB             :     }
    1046                 : 
    1047                 :     /*
    1048                 :      * unpin scan buffers
    1049                 :      */
    1050 GIC      380940 :     if (BufferIsValid(scan->rs_cbuf))
    1051            2374 :         ReleaseBuffer(scan->rs_cbuf);
    1052 ECB             : 
    1053                 :     /*
    1054                 :      * reinitialize scan descriptor
    1055 EUB             :      */
    1056 GIC      380940 :     initscan(scan, key, true);
    1057 CBC      380940 : }
    1058                 : 
    1059 ECB             : void
    1060 CBC      836289 : heap_endscan(TableScanDesc sscan)
    1061                 : {
    1062 GIC      836289 :     HeapScanDesc scan = (HeapScanDesc) sscan;
    1063                 : 
    1064                 :     /* Note: no locking manipulations needed */
    1065                 : 
    1066                 :     /*
    1067                 :      * unpin scan buffers
    1068                 :      */
    1069 CBC      836289 :     if (BufferIsValid(scan->rs_cbuf))
    1070 GIC      455999 :         ReleaseBuffer(scan->rs_cbuf);
    1071 ECB             : 
    1072                 :     /*
    1073                 :      * decrement relation reference count and free scan descriptor storage
    1074                 :      */
    1075 GIC      836289 :     RelationDecrementReferenceCount(scan->rs_base.rs_rd);
    1076                 : 
    1077          836289 :     if (scan->rs_base.rs_key)
    1078 CBC      643338 :         pfree(scan->rs_base.rs_key);
    1079 EUB             : 
    1080 GIC      836289 :     if (scan->rs_strategy != NULL)
    1081 CBC        8694 :         FreeAccessStrategy(scan->rs_strategy);
    1082                 : 
    1083 GIC      836289 :     if (scan->rs_parallelworkerdata != NULL)
    1084            1895 :         pfree(scan->rs_parallelworkerdata);
    1085                 : 
    1086          836289 :     if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
    1087          126389 :         UnregisterSnapshot(scan->rs_base.rs_snapshot);
    1088 ECB             : 
    1089 GIC      836289 :     pfree(scan);
    1090 CBC      836289 : }
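
A minimal sketch of the scan-descriptor lifecycle, assuming the caller already holds "rel" open and has "snapshot" registered (most backend code reaches these routines through the table AM wrappers rather than calling them directly):

    TableScanDesc sscan;

    sscan = heap_beginscan(rel, snapshot, 0, NULL, NULL,
                           SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
                           SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE);

    /* ... fetch tuples with heap_getnext() or heap_getnextslot() ... */

    /* restart from the beginning; set_params = false keeps the current flags */
    heap_rescan(sscan, NULL, false, false, false, false);

    /* ... fetch again ... */

    heap_endscan(sscan);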
    1091                 : 
    1092                 : HeapTuple
    1093 GIC    14659596 : heap_getnext(TableScanDesc sscan, ScanDirection direction)
    1094                 : {
    1095        14659596 :     HeapScanDesc scan = (HeapScanDesc) sscan;
    1096                 : 
    1097 ECB             :     /*
    1098                 :      * This is still widely used directly, without going through table AM, so
    1099 EUB             :      * add a safety check.  It's possible we should, at a later point,
    1100                 :      * downgrade this to an assert. The reason for checking the AM routine,
     1101                 :      * rather than the AM oid, is that this allows writing regression tests
    1102 ECB             :      * that create another AM reusing the heap handler.
    1103                 :      */
    1104 GIC    14659596 :     if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
    1105 UIC           0 :         ereport(ERROR,
    1106                 :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
    1107                 :                  errmsg_internal("only heap AM is supported")));
    1108                 : 
    1109 ECB             :     /*
    1110                 :      * We don't expect direct calls to heap_getnext with valid CheckXidAlive
    1111                 :      * for catalog or regular tables.  See detailed comments in xact.c where
     1112                 :      * these variables are declared.  Normally we have such a check at the
     1113                 :      * tableam API level, but this is called from many places so we need to ensure it
    1114                 :      * here.
    1115                 :      */
    1116 GIC    14659596 :     if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
    1117 UIC           0 :         elog(ERROR, "unexpected heap_getnext call during logical decoding");
    1118                 : 
    1119                 :     /* Note: no locking manipulations needed */
    1120                 : 
    1121 GIC    14659596 :     if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
    1122         7639618 :         heapgettup_pagemode(scan, direction,
    1123         7639618 :                             scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
    1124                 :     else
    1125         7019978 :         heapgettup(scan, direction,
    1126         7019978 :                    scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
    1127                 : 
    1128        14659596 :     if (scan->rs_ctup.t_data == NULL)
    1129           92788 :         return NULL;
    1130                 : 
    1131                 :     /*
    1132                 :      * if we get here it means we have a new current scan tuple, so point to
    1133                 :      * the proper return buffer and return the tuple.
    1134                 :      */
    1135                 : 
    1136        14566808 :     pgstat_count_heap_getnext(scan->rs_base.rs_rd);
    1137                 : 
    1138        14566808 :     return &scan->rs_ctup;
    1139                 : }
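
A sketch of draining a scan with heap_getnext, in the style of catalog-scan callers, assuming "sscan" from the lifecycle sketch above. The returned tuple points into a pinned buffer and is only valid until the next call or heap_endscan():

    HeapTuple   tuple;

    while ((tuple = heap_getnext(sscan, ForwardScanDirection)) != NULL)
    {
        /* examine tuple->t_data here; do not free or modify it */
    }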
    1140                 : 
    1141                 : bool
    1142        41919130 : heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
    1143                 : {
    1144        41919130 :     HeapScanDesc scan = (HeapScanDesc) sscan;
    1145                 : 
    1146                 :     /* Note: no locking manipulations needed */
    1147                 : 
    1148        41919130 :     if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
    1149 CBC    41417667 :         heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
    1150                 :     else
    1151 GIC      501463 :         heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
    1152                 : 
    1153        41919119 :     if (scan->rs_ctup.t_data == NULL)
    1154                 :     {
    1155 CBC      636263 :         ExecClearTuple(slot);
    1156 GIC      636263 :         return false;
    1157                 :     }
    1158                 : 
    1159                 :     /*
    1160                 :      * if we get here it means we have a new current scan tuple, so point to
    1161                 :      * the proper return buffer and return the tuple.
    1162                 :      */
    1163                 : 
    1164        41282856 :     pgstat_count_heap_getnext(scan->rs_base.rs_rd);
    1165 ECB             : 
    1166 GIC    41282856 :     ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
    1167                 :                              scan->rs_cbuf);
    1168        41282856 :     return true;
    1169                 : }
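
The slot-based variant is what the table AM layer itself invokes. A sketch, again assuming "sscan" and the scanned relation "rel"; table_slot_create() picks the appropriate slot type (for heap, a buffer heap tuple slot):

    TupleTableSlot *slot = table_slot_create(rel, NULL);

    while (heap_getnextslot(sscan, ForwardScanDirection, slot))
    {
        /* the slot holds the current tuple, kept alive by its buffer pin */
    }
    ExecDropSingleTupleTableSlot(slot);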
    1170 ECB             : 
    1171                 : void
    1172 CBC          89 : heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
    1173                 :                   ItemPointer maxtid)
    1174                 : {
    1175 GIC          89 :     HeapScanDesc scan = (HeapScanDesc) sscan;
    1176                 :     BlockNumber startBlk;
    1177                 :     BlockNumber numBlks;
    1178 ECB             :     ItemPointerData highestItem;
    1179                 :     ItemPointerData lowestItem;
    1180                 : 
    1181 EUB             :     /*
    1182                 :      * For relations without any pages, we can simply leave the TID range
    1183                 :      * unset.  There will be no tuples to scan, therefore no tuples outside
    1184                 :      * the given TID range.
    1185                 :      */
    1186 GIC          89 :     if (scan->rs_nblocks == 0)
    1187              24 :         return;
    1188                 : 
    1189                 :     /*
    1190                 :      * Set up some ItemPointers which point to the first and last possible
    1191 ECB             :      * tuples in the heap.
    1192                 :      */
    1193 GIC          83 :     ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber);
    1194              83 :     ItemPointerSet(&lowestItem, 0, FirstOffsetNumber);
    1195                 : 
    1196 ECB             :     /*
    1197                 :      * If the given maximum TID is below the highest possible TID in the
    1198                 :      * relation, then restrict the range to that, otherwise we scan to the end
    1199                 :      * of the relation.
    1200                 :      */
    1201 CBC          83 :     if (ItemPointerCompare(maxtid, &highestItem) < 0)
    1202              66 :         ItemPointerCopy(maxtid, &highestItem);
    1203                 : 
    1204                 :     /*
    1205                 :      * If the given minimum TID is above the lowest possible TID in the
    1206                 :      * relation, then restrict the range to only scan for TIDs above that.
    1207                 :      */
    1208              83 :     if (ItemPointerCompare(mintid, &lowestItem) > 0)
    1209              26 :         ItemPointerCopy(mintid, &lowestItem);
    1210 ECB             : 
    1211                 :     /*
     1212                 :      * Check for an empty range and protect against would-be negative results
    1213                 :      * from the numBlks calculation below.
    1214                 :      */
    1215 CBC          83 :     if (ItemPointerCompare(&highestItem, &lowestItem) < 0)
    1216                 :     {
    1217 ECB             :         /* Set an empty range of blocks to scan */
    1218 CBC          18 :         heap_setscanlimits(sscan, 0, 0);
    1219              18 :         return;
    1220                 :     }
    1221 ECB             : 
    1222                 :     /*
    1223                 :      * Calculate the first block and the number of blocks we must scan. We
     1224                 :      * could be more aggressive here and perform some more validation to
     1225                 :      * try to further narrow the scope of blocks to scan by checking if the
     1226                 :      * lowestItem has an offset above MaxOffsetNumber.  In this case, we could
     1227                 :      * advance startBlk by one.  Likewise, if highestItem has an offset of 0
     1228                 :      * we could scan one fewer block.  However, such an optimization does not
    1229                 :      * seem worth troubling over, currently.
    1230                 :      */
    1231 CBC          65 :     startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem);
    1232                 : 
    1233              65 :     numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) -
    1234 GIC          65 :         ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1;
    1235                 : 
    1236                 :     /* Set the start block and number of blocks to scan */
    1237 CBC          65 :     heap_setscanlimits(sscan, startBlk, numBlks);
    1238 ECB             : 
    1239                 :     /* Finally, set the TID range in sscan */
    1240 GIC          65 :     ItemPointerCopy(&lowestItem, &sscan->rs_mintid);
    1241 CBC          65 :     ItemPointerCopy(&highestItem, &sscan->rs_maxtid);
    1242 ECB             : }
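
A worked example of the calculation above, with hypothetical values: given rs_nblocks = 10, mintid = (2,5) and maxtid = (7,1), neither bound gets clamped, so startBlk = 2 and numBlks = 7 - 2 + 1 = 6, limiting the scan to blocks 2 through 7:

    /* Hypothetical setup: restrict a 10-block relation to blocks 2..7. */
    ItemPointerData mintid,
                    maxtid;

    ItemPointerSet(&mintid, 2, 5);      /* (block 2, offset 5) */
    ItemPointerSet(&maxtid, 7, 1);      /* (block 7, offset 1) */
    heap_set_tidrange(sscan, &mintid, &maxtid);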
    1243                 : 
    1244                 : bool
    1245 GIC        2970 : heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction,
    1246 ECB             :                           TupleTableSlot *slot)
    1247                 : {
    1248 GIC        2970 :     HeapScanDesc scan = (HeapScanDesc) sscan;
    1249            2970 :     ItemPointer mintid = &sscan->rs_mintid;
    1250            2970 :     ItemPointer maxtid = &sscan->rs_maxtid;
    1251                 : 
    1252                 :     /* Note: no locking manipulations needed */
    1253                 :     for (;;)
    1254                 :     {
    1255            3063 :         if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
    1256            3063 :             heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
    1257                 :         else
    1258 UIC           0 :             heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
    1259                 : 
    1260 GIC        3063 :         if (scan->rs_ctup.t_data == NULL)
    1261                 :         {
    1262              47 :             ExecClearTuple(slot);
    1263              47 :             return false;
    1264                 :         }
    1265                 : 
    1266                 :         /*
    1267                 :          * heap_set_tidrange will have used heap_setscanlimits to limit the
    1268                 :          * range of pages we scan to only ones that can contain the TID range
    1269                 :          * we're scanning for.  Here we must filter out any tuples from these
    1270 ECB             :          * pages that are outside of that range.
    1271                 :          */
    1272 GIC        3016 :         if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
    1273                 :         {
    1274 CBC          93 :             ExecClearTuple(slot);
    1275 ECB             : 
    1276                 :             /*
    1277                 :              * When scanning backwards, the TIDs will be in descending order.
    1278                 :              * Future tuples in this direction will be lower still, so we can
    1279                 :              * just return false to indicate there will be no more tuples.
    1280                 :              */
    1281 CBC          93 :             if (ScanDirectionIsBackward(direction))
    1282 UIC           0 :                 return false;
    1283                 : 
    1284 CBC          93 :             continue;
    1285 ECB             :         }
    1286                 : 
    1287                 :         /*
    1288                 :          * Likewise for the final page, we must filter out TIDs greater than
    1289                 :          * maxtid.
    1290                 :          */
    1291 GIC        2923 :         if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
    1292                 :         {
    1293 CBC          36 :             ExecClearTuple(slot);
    1294 ECB             : 
    1295                 :             /*
    1296                 :              * When scanning forward, the TIDs will be in ascending order.
    1297                 :              * Future tuples in this direction will be higher still, so we can
    1298                 :              * just return false to indicate there will be no more tuples.
    1299                 :              */
    1300 GIC          36 :             if (ScanDirectionIsForward(direction))
    1301              36 :                 return false;
    1302 LBC           0 :             continue;
    1303                 :         }
    1304                 : 
    1305 CBC        2887 :         break;
    1306                 :     }
    1307                 : 
    1308 ECB             :     /*
    1309                 :      * if we get here it means we have a new current scan tuple, so point to
    1310                 :      * the proper return buffer and return the tuple.
    1311                 :      */
    1312 GIC        2887 :     pgstat_count_heap_getnext(scan->rs_base.rs_rd);
    1313                 : 
    1314 CBC        2887 :     ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
    1315            2887 :     return true;
    1316 ECB             : }
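
Continuing the sketch above (with "slot" created as in the earlier heap_getnextslot example): the boundary pages can hold tuples outside the requested range, and this routine is what filters them out, so the caller only ever sees in-range TIDs:

    while (heap_getnextslot_tidrange(sscan, ForwardScanDirection, slot))
    {
        /* only tuples with mintid <= t_self <= maxtid arrive here */
    }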
    1317                 : 
    1318                 : /*
    1319                 :  *  heap_fetch      - retrieve tuple with given tid
    1320                 :  *
    1321                 :  * On entry, tuple->t_self is the TID to fetch.  We pin the buffer holding
    1322                 :  * the tuple, fill in the remaining fields of *tuple, and check the tuple
    1323                 :  * against the specified snapshot.
    1324                 :  *
    1325                 :  * If successful (tuple found and passes snapshot time qual), then *userbuf
    1326                 :  * is set to the buffer holding the tuple and true is returned.  The caller
    1327                 :  * must unpin the buffer when done with the tuple.
    1328                 :  *
    1329                 :  * If the tuple is not found (ie, item number references a deleted slot),
    1330                 :  * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer,
    1331                 :  * and false is returned.
    1332                 :  *
    1333                 :  * If the tuple is found but fails the time qual check, then the behavior
    1334                 :  * depends on the keep_buf parameter.  If keep_buf is false, the results
    1335                 :  * are the same as for the tuple-not-found case.  If keep_buf is true,
    1336                 :  * then tuple->t_data and *userbuf are returned as for the success case,
    1337 EUB             :  * and again the caller must unpin the buffer; but false is returned.
    1338                 :  *
    1339                 :  * heap_fetch does not follow HOT chains: only the exact TID requested will
    1340                 :  * be fetched.
    1341                 :  *
    1342                 :  * It is somewhat inconsistent that we ereport() on invalid block number but
    1343 ECB             :  * return false on invalid item number.  There are a couple of reasons though.
    1344                 :  * One is that the caller can relatively easily check the block number for
    1345                 :  * validity, but cannot check the item number without reading the page
     1346 EUB             :  * themselves.  Another is that when we are following a t_ctid link, we can be
    1347                 :  * reasonably confident that the page number is valid (since VACUUM shouldn't
    1348                 :  * truncate off the destination page without having killed the referencing
    1349                 :  * tuple first), but the item number might well not be good.
    1350                 :  */
    1351                 : bool
    1352 GIC      196204 : heap_fetch(Relation relation,
    1353                 :            Snapshot snapshot,
    1354                 :            HeapTuple tuple,
    1355 ECB             :            Buffer *userbuf,
    1356                 :            bool keep_buf)
    1357                 : {
    1358 CBC      196204 :     ItemPointer tid = &(tuple->t_self);
    1359 ECB             :     ItemId      lp;
    1360                 :     Buffer      buffer;
    1361                 :     Page        page;
    1362                 :     OffsetNumber offnum;
    1363                 :     bool        valid;
    1364                 : 
    1365                 :     /*
    1366                 :      * Fetch and pin the appropriate page of the relation.
    1367                 :      */
    1368 CBC      196204 :     buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
    1369 ECB             : 
    1370                 :     /*
    1371                 :      * Need share lock on buffer to examine tuple commit status.
    1372                 :      */
    1373 GIC      196204 :     LockBuffer(buffer, BUFFER_LOCK_SHARE);
    1374          196204 :     page = BufferGetPage(buffer);
    1375          196204 :     TestForOldSnapshot(snapshot, relation, page);
    1376                 : 
    1377                 :     /*
    1378                 :      * We'd better check for out-of-range offnum in case of VACUUM since the
    1379                 :      * TID was obtained.
    1380                 :      */
    1381          196204 :     offnum = ItemPointerGetOffsetNumber(tid);
    1382 CBC      196204 :     if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
    1383                 :     {
    1384 LBC           0 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    1385               0 :         ReleaseBuffer(buffer);
    1386 UIC           0 :         *userbuf = InvalidBuffer;
    1387 LBC           0 :         tuple->t_data = NULL;
    1388               0 :         return false;
    1389                 :     }
    1390                 : 
    1391                 :     /*
    1392                 :      * get the item line pointer corresponding to the requested tid
    1393                 :      */
    1394 GIC      196204 :     lp = PageGetItemId(page, offnum);
    1395 ECB             : 
    1396                 :     /*
    1397                 :      * Must check for deleted tuple.
    1398                 :      */
    1399 CBC      196204 :     if (!ItemIdIsNormal(lp))
    1400 ECB             :     {
    1401 CBC           2 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    1402 GIC           2 :         ReleaseBuffer(buffer);
    1403               2 :         *userbuf = InvalidBuffer;
    1404               2 :         tuple->t_data = NULL;
    1405               2 :         return false;
    1406                 :     }
    1407 ECB             : 
    1408                 :     /*
    1409                 :      * fill in *tuple fields
    1410                 :      */
    1411 GIC      196202 :     tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
    1412          196202 :     tuple->t_len = ItemIdGetLength(lp);
    1413          196202 :     tuple->t_tableOid = RelationGetRelid(relation);
    1414                 : 
    1415                 :     /*
    1416                 :      * check tuple visibility, then release lock
    1417                 :      */
    1418          196202 :     valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
    1419                 : 
    1420          196202 :     if (valid)
    1421          196162 :         PredicateLockTID(relation, &(tuple->t_self), snapshot,
    1422 CBC      196162 :                          HeapTupleHeaderGetXmin(tuple->t_data));
    1423                 : 
    1424 GIC      196202 :     HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
    1425 ECB             : 
    1426 CBC      196202 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    1427                 : 
    1428 GIC      196202 :     if (valid)
    1429                 :     {
    1430                 :         /*
    1431                 :          * All checks passed, so return the tuple as valid. Caller is now
    1432                 :          * responsible for releasing the buffer.
    1433                 :          */
    1434          196162 :         *userbuf = buffer;
    1435 ECB             : 
    1436 GIC      196162 :         return true;
    1437                 :     }
    1438                 : 
    1439                 :     /* Tuple failed time qual, but maybe caller wants to see it anyway. */
    1440              40 :     if (keep_buf)
    1441              27 :         *userbuf = buffer;
    1442                 :     else
    1443                 :     {
    1444              13 :         ReleaseBuffer(buffer);
    1445              13 :         *userbuf = InvalidBuffer;
    1446 CBC          13 :         tuple->t_data = NULL;
    1447 ECB             :     }
    1448                 : 
    1449 CBC          40 :     return false;
    1450                 : }
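
A sketch of a caller, assuming "tid" was obtained earlier (for example from an index) and "rel" is open; remember that heap_fetch does not follow HOT chains:

    HeapTupleData tup;
    Buffer      buf;

    tup.t_self = tid;
    if (heap_fetch(rel, GetActiveSnapshot(), &tup, &buf, false))
    {
        /* tuple is visible under the snapshot; use tup.t_data ... */
        ReleaseBuffer(buf);     /* caller must drop the pin when done */
    }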
    1451                 : 
    1452                 : /*
    1453                 :  *  heap_hot_search_buffer  - search HOT chain for tuple satisfying snapshot
    1454                 :  *
    1455                 :  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
    1456                 :  * of a HOT chain), and buffer is the buffer holding this tuple.  We search
    1457                 :  * for the first chain member satisfying the given snapshot.  If one is
    1458                 :  * found, we update *tid to reference that tuple's offset number, and
    1459                 :  * return true.  If no match, return false without modifying *tid.
    1460 ECB             :  *
    1461                 :  * heapTuple is a caller-supplied buffer.  When a match is found, we return
    1462                 :  * the tuple here, in addition to updating *tid.  If no match is found, the
    1463                 :  * contents of this buffer on return are undefined.
    1464                 :  *
    1465                 :  * If all_dead is not NULL, we check non-visible tuples to see if they are
    1466                 :  * globally dead; *all_dead is set true if all members of the HOT chain
    1467                 :  * are vacuumable, false if not.
    1468                 :  *
    1469                 :  * Unlike heap_fetch, the caller must already have pin and (at least) share
    1470                 :  * lock on the buffer; it is still pinned/locked at exit.
    1471                 :  */
    1472                 : bool
    1473 GBC    25144495 : heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
    1474 EUB             :                        Snapshot snapshot, HeapTuple heapTuple,
    1475                 :                        bool *all_dead, bool first_call)
    1476 ECB             : {
    1477 GNC    25144495 :     Page        page = BufferGetPage(buffer);
    1478 GIC    25144495 :     TransactionId prev_xmax = InvalidTransactionId;
    1479 EUB             :     BlockNumber blkno;
    1480                 :     OffsetNumber offnum;
    1481                 :     bool        at_chain_start;
    1482                 :     bool        valid;
    1483                 :     bool        skip;
    1484 CBC    25144495 :     GlobalVisState *vistest = NULL;
    1485 ECB             : 
    1486                 :     /* If this is not the first call, previous call returned a (live!) tuple */
    1487 CBC    25144495 :     if (all_dead)
    1488 GIC    21146953 :         *all_dead = first_call;
    1489                 : 
    1490        25144495 :     blkno = ItemPointerGetBlockNumber(tid);
    1491        25144495 :     offnum = ItemPointerGetOffsetNumber(tid);
    1492        25144495 :     at_chain_start = first_call;
    1493 CBC    25144495 :     skip = !first_call;
    1494 ECB             : 
    1495                 :     /* XXX: we should assert that a snapshot is pushed or registered */
    1496 GBC    25144495 :     Assert(TransactionIdIsValid(RecentXmin));
    1497        25144495 :     Assert(BufferGetBlockNumber(buffer) == blkno);
    1498                 : 
    1499                 :     /* Scan through possible multiple members of HOT-chain */
    1500                 :     for (;;)
    1501 GIC      925026 :     {
    1502                 :         ItemId      lp;
    1503                 : 
    1504 ECB             :         /* check for bogus TID */
    1505 GNC    26069521 :         if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
    1506 ECB             :             break;
    1507                 : 
    1508 GNC    26069521 :         lp = PageGetItemId(page, offnum);
    1509                 : 
    1510                 :         /* check for unused, dead, or redirected items */
    1511 GIC    26069521 :         if (!ItemIdIsNormal(lp))
    1512 ECB             :         {
    1513                 :             /* We should only see a redirect at start of chain */
    1514 CBC      806940 :             if (ItemIdIsRedirected(lp) && at_chain_start)
    1515 ECB             :             {
    1516                 :                 /* Follow the redirect */
    1517 CBC      395211 :                 offnum = ItemIdGetRedirect(lp);
    1518          395211 :                 at_chain_start = false;
    1519 GIC      395211 :                 continue;
    1520                 :             }
    1521 ECB             :             /* else must be end of chain */
    1522 CBC      411729 :             break;
    1523 ECB             :         }
    1524                 : 
    1525                 :         /*
    1526                 :          * Update heapTuple to point to the element of the HOT chain we're
    1527                 :          * currently investigating. Having t_self set correctly is important
    1528                 :          * because the SSI checks and the *Satisfies routine for historical
    1529                 :          * MVCC snapshots need the correct tid to decide about the visibility.
    1530                 :          */
    1531 GNC    25262581 :         heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
    1532 GIC    25262581 :         heapTuple->t_len = ItemIdGetLength(lp);
    1533        25262581 :         heapTuple->t_tableOid = RelationGetRelid(relation);
    1534        25262581 :         ItemPointerSet(&heapTuple->t_self, blkno, offnum);
    1535                 : 
    1536                 :         /*
    1537                 :          * Shouldn't see a HEAP_ONLY tuple at chain start.
    1538                 :          */
    1539        25262581 :         if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
    1540 UIC           0 :             break;
    1541                 : 
    1542                 :         /*
    1543                 :          * The xmin should match the previous xmax value, else chain is
    1544                 :          * broken.
    1545 ECB             :          */
    1546 GIC    25792396 :         if (TransactionIdIsValid(prev_xmax) &&
    1547 CBC      529815 :             !TransactionIdEquals(prev_xmax,
    1548 ECB             :                                  HeapTupleHeaderGetXmin(heapTuple->t_data)))
    1549 UIC           0 :             break;
    1550 ECB             : 
    1551                 :         /*
    1552                 :          * When first_call is true (and thus, skip is initially false) we'll
    1553                 :          * return the first tuple we find.  But on later passes, heapTuple
    1554                 :          * will initially be pointing to the tuple we returned last time.
    1555                 :          * Returning it again would be incorrect (and would loop forever), so
    1556                 :          * we skip it and return the next match we find.
    1557                 :          */
    1558 GIC    25262581 :         if (!skip)
    1559                 :         {
    1560 ECB             :             /* If it's visible per the snapshot, we must return it */
    1561 GIC    25088789 :             valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
    1562        25088789 :             HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
    1563                 :                                                 buffer, snapshot);
    1564                 : 
    1565        25088784 :             if (valid)
    1566                 :             {
    1567 CBC    18390562 :                 ItemPointerSetOffsetNumber(tid, offnum);
    1568 GIC    18390562 :                 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
    1569        18390562 :                                  HeapTupleHeaderGetXmin(heapTuple->t_data));
    1570        18390562 :                 if (all_dead)
    1571 CBC    14678740 :                     *all_dead = false;
    1572        18390562 :                 return true;
    1573 ECB             :             }
    1574                 :         }
    1575 CBC     6872014 :         skip = false;
    1576 ECB             : 
    1577                 :         /*
    1578                 :          * If we can't see it, maybe no one else can either.  At caller
    1579                 :          * request, check whether all chain members are dead to all
    1580                 :          * transactions.
    1581                 :          *
    1582                 :          * Note: if you change the criterion here for what is "dead", fix the
    1583                 :          * planner's get_actual_variable_range() function to match.
    1584                 :          */
    1585 CBC     6872014 :         if (all_dead && *all_dead)
    1586 ECB             :         {
    1587 CBC     6466218 :             if (!vistest)
    1588         6381533 :                 vistest = GlobalVisTestFor(relation);
    1589 ECB             : 
    1590 GIC     6466218 :             if (!HeapTupleIsSurelyDead(heapTuple, vistest))
    1591         6143314 :                 *all_dead = false;
    1592                 :         }
    1593                 : 
    1594                 :         /*
    1595 ECB             :          * Check to see if HOT chain continues past this tuple; if so fetch
    1596                 :          * the next offnum and loop around.
    1597                 :          */
    1598 CBC     6872014 :         if (HeapTupleIsHotUpdated(heapTuple))
    1599 ECB             :         {
    1600 CBC      529815 :             Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
    1601                 :                    blkno);
    1602 GIC      529815 :             offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
    1603          529815 :             at_chain_start = false;
    1604          529815 :             prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
    1605                 :         }
    1606                 :         else
    1607                 :             break;              /* end of chain */
    1608                 :     }
    1609                 : 
    1610         6753928 :     return false;
    1611                 : }
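
A sketch of the expected calling pattern, with hypothetical locals ("buffer", "tid", "rel", "snapshot" assumed set up by the caller): pin and share-lock the buffer, probe the chain rooted at tid, then use all_dead to decide whether an index entry is killable:

    HeapTupleData heapTuple;
    bool        all_dead = false;
    bool        found;

    LockBuffer(buffer, BUFFER_LOCK_SHARE);
    found = heap_hot_search_buffer(&tid, rel, buffer, snapshot,
                                   &heapTuple, &all_dead, true);
    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    /* found => tid was updated to the visible chain member;
     * all_dead => every chain member is vacuumable */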
    1612                 : 
    1613                 : /*
    1614                 :  *  heap_get_latest_tid -  get the latest tid of a specified tuple
    1615                 :  *
    1616                 :  * Actually, this gets the latest version that is visible according to the
    1617                 :  * scan's snapshot.  Create a scan using SnapshotDirty to get the very latest,
    1618                 :  * possibly uncommitted version.
    1619                 :  *
    1620                 :  * *tid is both an input and an output parameter: it is updated to
    1621                 :  * show the latest version of the row.  Note that it will not be changed
    1622 ECB             :  * if no version of the row passes the snapshot test.
    1623                 :  */
    1624                 : void
    1625 CBC         147 : heap_get_latest_tid(TableScanDesc sscan,
    1626                 :                     ItemPointer tid)
    1627                 : {
    1628             147 :     Relation    relation = sscan->rs_rd;
    1629             147 :     Snapshot    snapshot = sscan->rs_snapshot;
    1630                 :     ItemPointerData ctid;
    1631                 :     TransactionId priorXmax;
    1632 ECB             : 
    1633                 :     /*
    1634                 :      * table_tuple_get_latest_tid() verified that the passed in tid is valid.
    1635                 :      * Assume that t_ctid links are valid however - there shouldn't be invalid
    1636                 :      * ones in the table.
    1637                 :      */
    1638 GIC         147 :     Assert(ItemPointerIsValid(tid));
    1639                 : 
    1640                 :     /*
    1641 ECB             :      * Loop to chase down t_ctid links.  At top of loop, ctid is the tuple we
    1642                 :      * need to examine, and *tid is the TID we will return if ctid turns out
    1643                 :      * to be bogus.
    1644                 :      *
    1645                 :      * Note that we will loop until we reach the end of the t_ctid chain.
    1646                 :      * Depending on the snapshot passed, there might be at most one visible
    1647                 :      * version of the row, but we don't try to optimize for that.
    1648                 :      */
    1649 GIC         147 :     ctid = *tid;
    1650             147 :     priorXmax = InvalidTransactionId;   /* cannot check first XMIN */
    1651                 :     for (;;)
    1652              45 :     {
    1653                 :         Buffer      buffer;
    1654                 :         Page        page;
    1655                 :         OffsetNumber offnum;
    1656                 :         ItemId      lp;
    1657                 :         HeapTupleData tp;
    1658                 :         bool        valid;
    1659                 : 
    1660                 :         /*
    1661                 :          * Read, pin, and lock the page.
    1662                 :          */
    1663             192 :         buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
    1664             192 :         LockBuffer(buffer, BUFFER_LOCK_SHARE);
    1665             192 :         page = BufferGetPage(buffer);
    1666             192 :         TestForOldSnapshot(snapshot, relation, page);
    1667 ECB             : 
    1668                 :         /*
    1669                 :          * Check for bogus item number.  This is not treated as an error
    1670                 :          * condition because it can happen while following a t_ctid link. We
    1671                 :          * just assume that the prior tid is OK and return it unchanged.
    1672                 :          */
    1673 CBC         192 :         offnum = ItemPointerGetOffsetNumber(&ctid);
    1674 GIC         192 :         if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
    1675 ECB             :         {
    1676 UIC           0 :             UnlockReleaseBuffer(buffer);
    1677 LBC           0 :             break;
    1678 ECB             :         }
    1679 CBC         192 :         lp = PageGetItemId(page, offnum);
    1680             192 :         if (!ItemIdIsNormal(lp))
    1681                 :         {
    1682 UIC           0 :             UnlockReleaseBuffer(buffer);
    1683               0 :             break;
    1684                 :         }
    1685                 : 
    1686                 :         /* OK to access the tuple */
    1687 GIC         192 :         tp.t_self = ctid;
    1688             192 :         tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
    1689             192 :         tp.t_len = ItemIdGetLength(lp);
    1690             192 :         tp.t_tableOid = RelationGetRelid(relation);
    1691                 : 
    1692                 :         /*
    1693                 :          * After following a t_ctid link, we might arrive at an unrelated
    1694                 :          * tuple.  Check for XMIN match.
    1695 ECB             :          */
    1696 GIC         237 :         if (TransactionIdIsValid(priorXmax) &&
    1697              45 :             !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
    1698 ECB             :         {
    1699 UIC           0 :             UnlockReleaseBuffer(buffer);
    1700               0 :             break;
    1701                 :         }
    1702                 : 
    1703 ECB             :         /*
    1704                 :          * Check tuple visibility; if visible, set it as the new result
    1705                 :          * candidate.
    1706                 :          */
    1707 GIC         192 :         valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
    1708             192 :         HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
    1709             192 :         if (valid)
    1710             135 :             *tid = ctid;
    1711 ECB             : 
    1712                 :         /*
    1713                 :          * If there's a valid t_ctid link, follow it, else we're done.
    1714                 :          */
    1715 GIC         273 :         if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
    1716             138 :             HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
    1717             114 :             HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
    1718              57 :             ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
    1719 ECB             :         {
    1720 CBC         147 :             UnlockReleaseBuffer(buffer);
    1721 GIC         147 :             break;
    1722 ECB             :         }
    1723                 : 
    1724 GIC          45 :         ctid = tp.t_data->t_ctid;
    1725              45 :         priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
    1726 CBC          45 :         UnlockReleaseBuffer(buffer);
    1727 ECB             :     }                           /* end of loop */
    1728 CBC         147 : }
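
A sketch, with "blkno" and "offnum" as hypothetical inputs naming some known version of the row:

    ItemPointerData tid;

    ItemPointerSet(&tid, blkno, offnum);    /* some version of the row */
    heap_get_latest_tid(sscan, &tid);       /* tid now names the latest
                                             * version visible to the
                                             * scan's snapshot */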
    1729 ECB             : 
    1730                 : 
    1731                 : /*
    1732                 :  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
    1733                 :  *
    1734                 :  * This is called after we have waited for the XMAX transaction to terminate.
    1735                 :  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
    1736                 :  * be set on exit.  If the transaction committed, we set the XMAX_COMMITTED
    1737                 :  * hint bit if possible --- but beware that that may not yet be possible,
    1738                 :  * if the transaction committed asynchronously.
    1739                 :  *
    1740                 :  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
    1741                 :  * even if it commits.
    1742                 :  *
    1743                 :  * Hence callers should look only at XMAX_INVALID.
    1744                 :  *
    1745                 :  * Note this is not allowed for tuples whose xmax is a multixact.
    1746                 :  */
    1747                 : static void
    1748 GIC         158 : UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
    1749 ECB             : {
    1750 CBC         158 :     Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
    1751 GIC         158 :     Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
    1752 ECB             : 
    1753 CBC         158 :     if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
    1754 ECB             :     {
    1755 GIC         288 :         if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
    1756             130 :             TransactionIdDidCommit(xid))
    1757             104 :             HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
    1758                 :                                  xid);
    1759                 :         else
    1760              54 :             HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
    1761 ECB             :                                  InvalidTransactionId);
    1762                 :     }
    1763 GIC         158 : }
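
A hedged sketch of the caller pattern (the real call sites are routines such as heap_delete and heap_update later in this file): after waiting out the xmax transaction and re-locking the buffer, refresh the hint bits so later checks need only consult XMAX_INVALID. The locals "xwait", "tp", "buffer" and "relation" are assumed from the surrounding caller:

    /* assumes xwait holds the tuple's raw xmax */
    XactLockTableWait(xwait, relation, &tp.t_self, XLTW_Delete);
    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    /* ... recheck that xmax still equals xwait before trusting it ... */
    UpdateXmaxHintBits(tp.t_data, buffer, xwait);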
    1764 ECB             : 
    1765                 : 
    1766                 : /*
    1767                 :  * GetBulkInsertState - prepare status object for a bulk insert
    1768                 :  */
    1769                 : BulkInsertState
    1770 GIC        2215 : GetBulkInsertState(void)
    1771 ECB             : {
    1772                 :     BulkInsertState bistate;
    1773                 : 
    1774 GIC        2215 :     bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
    1775            2215 :     bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
    1776 CBC        2215 :     bistate->current_buf = InvalidBuffer;
    1777 GNC        2215 :     bistate->next_free = InvalidBlockNumber;
    1778            2215 :     bistate->last_free = InvalidBlockNumber;
    1779 GIC        2215 :     return bistate;
    1780 ECB             : }
    1781                 : 
    1782                 : /*
    1783                 :  * FreeBulkInsertState - clean up after finishing a bulk insert
    1784                 :  */
    1785                 : void
    1786 GIC        2116 : FreeBulkInsertState(BulkInsertState bistate)
    1787                 : {
    1788            2116 :     if (bistate->current_buf != InvalidBuffer)
    1789            1766 :         ReleaseBuffer(bistate->current_buf);
    1790 CBC        2116 :     FreeAccessStrategy(bistate->strategy);
    1791 GIC        2116 :     pfree(bistate);
    1792            2116 : }
    1793 ECB             : 
    1794                 : /*
    1795                 :  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
    1796                 :  */
    1797                 : void
    1798 GIC       50755 : ReleaseBulkInsertStatePin(BulkInsertState bistate)
    1799 ECB             : {
    1800 GIC       50755 :     if (bistate->current_buf != InvalidBuffer)
    1801 CBC          24 :         ReleaseBuffer(bistate->current_buf);
    1802           50755 :     bistate->current_buf = InvalidBuffer;
    1803 GIC       50755 : }
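
A sketch of the bulk-insert pattern these helpers support (COPY FROM works along these lines), with "rel", "tuples" and "ntuples" as hypothetical caller-provided data:

    BulkInsertState bistate = GetBulkInsertState();

    for (int i = 0; i < ntuples; i++)
        heap_insert(rel, tuples[i], GetCurrentCommandId(true), 0, bistate);

    FreeBulkInsertState(bistate);   /* releases the kept pin and strategy */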
    1804 ECB             : 
    1805                 : 
    1806                 : /*
    1807                 :  *  heap_insert     - insert tuple into a heap
    1808                 :  *
    1809                 :  * The new tuple is stamped with current transaction ID and the specified
    1810                 :  * command ID.
    1811                 :  *
    1812                 :  * See table_tuple_insert for comments about most of the input flags, except
    1813                 :  * that this routine directly takes a tuple rather than a slot.
    1814                 :  *
     1815                 :  * There are corresponding HEAP_INSERT_ options for all the TABLE_INSERT_
     1816                 :  * options, and additionally there is HEAP_INSERT_SPECULATIVE, which is used to
    1817                 :  * implement table_tuple_insert_speculative().
    1818                 :  *
    1819                 :  * On return the header fields of *tup are updated to match the stored tuple;
    1820                 :  * in particular tup->t_self receives the actual TID where the tuple was
    1821                 :  * stored.  But note that any toasting of fields within the tuple data is NOT
    1822                 :  * reflected into *tup.
    1823 EUB             :  */
    1824                 : void
    1825 GIC    11835861 : heap_insert(Relation relation, HeapTuple tup, CommandId cid,
    1826                 :             int options, BulkInsertState bistate)
    1827 ECB             : {
    1828 CBC    11835861 :     TransactionId xid = GetCurrentTransactionId();
    1829 ECB             :     HeapTuple   heaptup;
    1830                 :     Buffer      buffer;
    1831 CBC    11835861 :     Buffer      vmbuffer = InvalidBuffer;
    1832        11835861 :     bool        all_visible_cleared = false;
    1833                 : 
    1834 ECB             :     /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
    1835 CBC    11835861 :     Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
    1836 ECB             :            RelationGetNumberOfAttributes(relation));
    1837                 : 
    1838                 :     /*
    1839                 :      * Fill in tuple header fields and toast the tuple if necessary.
    1840                 :      *
    1841                 :      * Note: below this point, heaptup is the data we actually intend to store
    1842                 :      * into the relation; tup is the caller's original untoasted data.
    1843                 :      */
    1844 GIC    11835861 :     heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
    1845                 : 
    1846 ECB             :     /*
    1847                 :      * Find buffer to insert this tuple into.  If the page is all visible,
    1848                 :      * this will also pin the requisite visibility map page.
    1849                 :      */
    1850 CBC    11835861 :     buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
    1851                 :                                        InvalidBuffer, options, bistate,
    1852                 :                                        &vmbuffer, NULL,
    1853                 :                                        0);
    1854                 : 
    1855                 :     /*
    1856                 :      * We're about to do the actual insert -- but check for conflict first, to
    1857                 :      * avoid possibly having to roll back work we've just done.
    1858                 :      *
    1859                 :      * This is safe without a recheck as long as there is no possibility of
    1860                 :      * another process scanning the page between this check and the insert
    1861                 :      * being visible to the scan (i.e., an exclusive buffer content lock is
    1862 ECB             :      * continuously held from this point until the tuple insert is visible).
    1863                 :      *
    1864                 :      * For a heap insert, we only need to check for table-level SSI locks. Our
    1865                 :      * new tuple can't possibly conflict with existing tuple locks, and heap
    1866                 :      * page locks are only consolidated versions of tuple locks; they do not
    1867                 :      * lock "gaps" as index page locks do.  So we don't need to specify a
    1868                 :      * buffer when making the call, which makes for a faster check.
    1869                 :      */
    1870 GIC    11835861 :     CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
    1871 ECB             : 
    1872                 :     /* NO EREPORT(ERROR) from here till changes are logged */
    1873 CBC    11835849 :     START_CRIT_SECTION();
    1874 ECB             : 
    1875 GIC    11835849 :     RelationPutHeapTuple(relation, buffer, heaptup,
    1876 CBC    11835849 :                          (options & HEAP_INSERT_SPECULATIVE) != 0);
    1877                 : 
    1878 GIC    11835849 :     if (PageIsAllVisible(BufferGetPage(buffer)))
    1879 ECB             :     {
    1880 GIC        7837 :         all_visible_cleared = true;
    1881            7837 :         PageClearAllVisible(BufferGetPage(buffer));
    1882            7837 :         visibilitymap_clear(relation,
    1883            7837 :                             ItemPointerGetBlockNumber(&(heaptup->t_self)),
    1884                 :                             vmbuffer, VISIBILITYMAP_VALID_BITS);
    1885                 :     }
    1886                 : 
    1887                 :     /*
    1888                 :      * XXX Should we set PageSetPrunable on this page?
    1889                 :      *
    1890                 :      * The inserting transaction may eventually abort, making this tuple
    1891                 :      * DEAD and hence available for pruning. Though we don't want to optimize
    1892                 :      * for aborts, if no other tuple on this page is UPDATEd/DELETEd, the
    1893                 :      * aborted tuple will never be pruned until the next vacuum is triggered.
    1894 ECB             :      *
    1895                 :      * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
    1896                 :      */
    1897                 : 
    1898 GIC    11835849 :     MarkBufferDirty(buffer);
    1899                 : 
    1900                 :     /* XLOG stuff */
    1901        11835849 :     if (RelationNeedsWAL(relation))
    1902                 :     {
    1903 ECB             :         xl_heap_insert xlrec;
    1904                 :         xl_heap_header xlhdr;
    1905                 :         XLogRecPtr  recptr;
    1906 CBC    10832412 :         Page        page = BufferGetPage(buffer);
    1907        10832412 :         uint8       info = XLOG_HEAP_INSERT;
    1908        10832412 :         int         bufflags = 0;
    1909 ECB             : 
    1910                 :         /*
    1911                 :          * If this is a catalog, we need to transmit combo CIDs to properly
    1912                 :          * decode, so log that as well.
    1913                 :          */
    1914 GIC    10832412 :         if (RelationIsAccessibleInLogicalDecoding(relation))
    1915 CBC        4420 :             log_heap_new_cid(relation, heaptup);
    1916 ECB             : 
    1917                 :         /*
    1918                 :          * If this is the first and only tuple on the page, we can reinit the
    1919                 :          * page instead of restoring the whole thing.  Set the flag, and hide
    1920                 :          * buffer references from XLogInsert.
    1921                 :          */
    1922 GIC    11002229 :         if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
    1923          169817 :             PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
    1924                 :         {
    1925 CBC      168908 :             info |= XLOG_HEAP_INIT_PAGE;
    1926          168908 :             bufflags |= REGBUF_WILL_INIT;
    1927 ECB             :         }
    1928                 : 
    1929 GIC    10832412 :         xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
    1930        10832412 :         xlrec.flags = 0;
    1931        10832412 :         if (all_visible_cleared)
    1932            7834 :             xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
    1933        10832412 :         if (options & HEAP_INSERT_SPECULATIVE)
    1934            2007 :             xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
    1935        10832412 :         Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
    1936                 : 
    1937                 :         /*
    1938                 :          * For logical decoding, we need the tuple even if we're doing a full
    1939                 :          * page write, so make sure it's included even if we take a full-page
    1940                 :          * image. (XXX We could alternatively store a pointer into the FPW).
    1941                 :          */
    1942        10832412 :         if (RelationIsLogicallyLogged(relation) &&
    1943          304647 :             !(options & HEAP_INSERT_NO_LOGICAL))
    1944                 :         {
    1945          304620 :             xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
    1946          304620 :             bufflags |= REGBUF_KEEP_DATA;
    1947                 : 
    1948          304620 :             if (IsToastRelation(relation))
    1949            1681 :                 xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION;
    1950                 :         }
    1951                 : 
    1952        10832412 :         XLogBeginInsert();
    1953        10832412 :         XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
    1954                 : 
    1955 CBC    10832412 :         xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
    1956 GIC    10832412 :         xlhdr.t_infomask = heaptup->t_data->t_infomask;
    1957 CBC    10832412 :         xlhdr.t_hoff = heaptup->t_data->t_hoff;
    1958 ECB             : 
    1959                 :         /*
    1960                 :          * note we mark xlhdr as belonging to the buffer; if XLogInsert decides
    1961                 :          * write the whole page to the xlog, we don't need to store
    1962                 :          * xl_heap_header in the xlog.
    1963                 :          */
    1964 GIC    10832412 :         XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
    1965 CBC    10832412 :         XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
    1966                 :         /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
    1967 GIC    10832412 :         XLogRegisterBufData(0,
    1968        10832412 :                             (char *) heaptup->t_data + SizeofHeapTupleHeader,
    1969        10832412 :                             heaptup->t_len - SizeofHeapTupleHeader);
    1970                 : 
    1971                 :         /* filtering by origin on a row level is much more efficient */
    1972        10832412 :         XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
    1973                 : 
    1974        10832412 :         recptr = XLogInsert(RM_HEAP_ID, info);
    1975                 : 
    1976        10832412 :         PageSetLSN(page, recptr);
    1977 ECB             :     }
    1978                 : 
    1979 CBC    11835849 :     END_CRIT_SECTION();
    1980                 : 
    1981        11835849 :     UnlockReleaseBuffer(buffer);
    1982 GIC    11835849 :     if (vmbuffer != InvalidBuffer)
    1983            7946 :         ReleaseBuffer(vmbuffer);
    1984 ECB             : 
    1985                 :     /*
    1986                 :      * If tuple is cachable, mark it for invalidation from the caches in case
    1987                 :      * we abort.  Note it is OK to do this after releasing the buffer, because
    1988                 :      * the heaptup data structure is all in local memory, not in the shared
    1989                 :      * buffer.
    1990                 :      */
    1991 GIC    11835849 :     CacheInvalidateHeapTuple(relation, heaptup, NULL);
    1992                 : 
    1993 ECB             :     /* Note: speculative insertions are counted too, even if aborted later */
    1994 GIC    11835849 :     pgstat_count_heap_insert(relation, 1);
    1995                 : 
    1996                 :     /*
    1997 ECB             :      * If heaptup is a private copy, release it.  Don't forget to copy t_self
    1998                 :      * back to the caller's image, too.
    1999                 :      */
    2000 GIC    11835849 :     if (heaptup != tup)
    2001 ECB             :     {
    2002 CBC       60551 :         tup->t_self = heaptup->t_self;
    2003 GIC       60551 :         heap_freetuple(heaptup);
    2004                 :     }
    2005 CBC    11835849 : }
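/*
 * Illustrative caller (a sketch, not part of heapam.c; insert_one_row()
 * is hypothetical): form a tuple and insert it with default options.
 * Per the header comment above, tup->t_self is updated to the tuple's
 * actual TID on return.
 */
static void
insert_one_row(Relation rel, TupleDesc tupdesc, Datum *values, bool *isnull)
{
    HeapTuple   tup = heap_form_tuple(tupdesc, values, isnull);

    heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);
    /* tup->t_self now points at the stored tuple */
    heap_freetuple(tup);
}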
    2006                 : 
    2007                 : /*
    2008                 :  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
    2009                 :  * tuple header fields and toasts the tuple if necessary.  Returns a toasted
    2010                 :  * version of the tuple if it was toasted, or the original tuple if not. Note
    2011 ECB             :  * that in any case, the header fields are also set in the original tuple.
    2012                 :  */
    2013                 : static HeapTuple
    2014 GIC    14837378 : heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
    2015                 :                     CommandId cid, int options)
    2016                 : {
    2017 ECB             :     /*
    2018                 :      * To allow parallel inserts, we need to ensure that they are safe to be
    2019                 :      * performed in workers. We have the infrastructure to allow parallel
    2020                 :      * inserts in general except for the cases where inserts generate a new
    2021                 :      * CommandId (e.g. inserts into a table having a foreign key column).
    2022                 :      */
    2023 GIC    14837378 :     if (IsParallelWorker())
    2024 LBC           0 :         ereport(ERROR,
    2025 ECB             :                 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
    2026                 :                  errmsg("cannot insert tuples in a parallel worker")));
    2027                 : 
    2028 GIC    14837378 :     tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
    2029        14837378 :     tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
    2030        14837378 :     tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
    2031        14837378 :     HeapTupleHeaderSetXmin(tup->t_data, xid);
    2032        14837378 :     if (options & HEAP_INSERT_FROZEN)
    2033 CBC      100337 :         HeapTupleHeaderSetXminFrozen(tup->t_data);
    2034 ECB             : 
    2035 GIC    14837378 :     HeapTupleHeaderSetCmin(tup->t_data, cid);
    2036        14837378 :     HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
    2037        14837378 :     tup->t_tableOid = RelationGetRelid(relation);
    2038                 : 
    2039                 :     /*
    2040                 :      * If the new tuple is too big for storage or contains already toasted
    2041                 :      * out-of-line attributes from some other relation, invoke the toaster.
    2042                 :      */
    2043        14837378 :     if (relation->rd_rel->relkind != RELKIND_RELATION &&
    2044 CBC      108856 :         relation->rd_rel->relkind != RELKIND_MATVIEW)
    2045                 :     {
    2046 ECB             :         /* toast table entries should never be recursively toasted */
    2047 CBC      107203 :         Assert(!HeapTupleHasExternal(tup));
    2048          107203 :         return tup;
    2049                 :     }
    2050 GIC    14730175 :     else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
    2051           60586 :         return heap_toast_insert_or_update(relation, tup, NULL, options);
    2052 ECB             :     else
    2053 CBC    14669589 :         return tup;
    2054                 : }
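/*
 * Condensed sketch of the toasting decision above (illustrative only;
 * needs_toast() is not a real heapam.c helper): only plain tables and
 * materialized views are toasted, and only when the tuple carries
 * external attributes or exceeds TOAST_TUPLE_THRESHOLD.
 */
static inline bool
needs_toast(Relation relation, HeapTuple tup)
{
    if (relation->rd_rel->relkind != RELKIND_RELATION &&
        relation->rd_rel->relkind != RELKIND_MATVIEW)
        return false;           /* e.g. toast tables themselves */
    return HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD;
}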
    2055                 : 
    2056                 : /*
    2057                 :  * Helper for heap_multi_insert() that computes the number of whole pages
    2058                 :  * needed to insert the remaining heaptuples.  Used to determine how much
    2059                 :  * the relation needs to be extended by.
    2060                 :  */
    2061                 : static int
    2062 GNC      827189 : heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
    2063                 : {
    2064          827189 :     size_t      page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
    2065          827189 :     int         npages = 1;
    2066                 : 
    2067         4585093 :     for (int i = done; i < ntuples; i++)
    2068                 :     {
    2069         3757904 :         size_t      tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
    2070                 : 
    2071         3757904 :         if (page_avail < tup_sz)
    2072                 :         {
    2073           18305 :             npages++;
    2074           18305 :             page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
    2075                 :         }
    2076         3757904 :         page_avail -= tup_sz;
    2077                 :     }
    2078                 : 
    2079          827189 :     return npages;
    2080                 : }
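/*
 * Worked example for heap_multi_insert_pages() (assuming the default 8kB
 * BLCKSZ, SizeOfPageHeaderData of 24 bytes, and saveFreeSpace == 0, i.e.
 * fillfactor 100): page_avail starts at 8168 bytes.  For tuples whose
 * MAXALIGN'd t_len is 96 bytes (100 bytes each with the 4-byte ItemIdData),
 * 81 tuples fit per page, so 1000 remaining tuples yield npages == 13
 * (12 full pages plus one holding the last 28 tuples).
 */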
    2081                 : 
    2082                 : /*
    2083                 :  *  heap_multi_insert   - insert multiple tuples into a heap
    2084                 :  *
    2085 ECB             :  * This is like heap_insert(), but inserts multiple tuples in one operation.
    2086                 :  * That's faster than calling heap_insert() in a loop, because when multiple
    2087                 :  * tuples can be inserted on a single page, we can write just a single WAL
    2088                 :  * record covering all of them, and only need to lock/unlock the page once.
    2089                 :  *
    2090                 :  * Note: this leaks memory into the current memory context. You can create a
    2091                 :  * temporary context before calling this, if that's a problem (see the sketch after this function).
    2092                 :  */
    2093                 : void
    2094 GIC      810261 : heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
    2095 ECB             :                   CommandId cid, int options, BulkInsertState bistate)
    2096                 : {
    2097 CBC      810261 :     TransactionId xid = GetCurrentTransactionId();
    2098                 :     HeapTuple  *heaptuples;
    2099                 :     int         i;
    2100                 :     int         ndone;
    2101                 :     PGAlignedBlock scratch;
    2102                 :     Page        page;
    2103          810261 :     Buffer      vmbuffer = InvalidBuffer;
    2104                 :     bool        needwal;
    2105                 :     Size        saveFreeSpace;
    2106          810261 :     bool        need_tuple_data = RelationIsLogicallyLogged(relation);
    2107          810261 :     bool        need_cids = RelationIsAccessibleInLogicalDecoding(relation);
    2108 GNC      810261 :     bool        starting_with_empty_page = false;
    2109          810261 :     int         npages = 0;
    2110          810261 :     int         npages_used = 0;
    2111                 : 
    2112                 :     /* currently not needed (thus unsupported) for heap_multi_insert() */
    2113          810261 :     Assert(!(options & HEAP_INSERT_NO_LOGICAL));
    2114                 : 
    2115 GIC      810261 :     needwal = RelationNeedsWAL(relation);
    2116          810261 :     saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
    2117                 :                                                    HEAP_DEFAULT_FILLFACTOR);
    2118 ECB             : 
    2119                 :     /* Toast and set header data in all the slots */
    2120 GIC      810261 :     heaptuples = palloc(ntuples * sizeof(HeapTuple));
    2121         3811778 :     for (i = 0; i < ntuples; i++)
    2122 ECB             :     {
    2123                 :         HeapTuple   tuple;
    2124                 : 
    2125 CBC     3001517 :         tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
    2126 GIC     3001517 :         slots[i]->tts_tableOid = RelationGetRelid(relation);
    2127 CBC     3001517 :         tuple->t_tableOid = slots[i]->tts_tableOid;
    2128         3001517 :         heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
    2129 ECB             :                                             options);
    2130                 :     }
    2131                 : 
    2132                 :     /*
    2133                 :      * We're about to do the actual inserts -- but check for conflict first,
    2134                 :      * to minimize the possibility of having to roll back work we've just
    2135                 :      * done.
    2136                 :      *
    2137                 :      * A check here does not definitively prevent a serialization anomaly;
    2138                 :      * that check MUST be done at least past the point of acquiring an
    2139                 :      * exclusive buffer content lock on every buffer that will be affected,
    2140                 :      * and MAY be done after all inserts are reflected in the buffers and
    2141                 :      * those locks are released; otherwise there is a race condition.  Since
    2142                 :      * multiple buffers can be locked and unlocked in the loop below, and it
    2143                 :      * would not be feasible to identify and lock all of those buffers before
    2144                 :      * the loop, we must do a final check at the end.
    2145                 :      *
    2146                 :      * The check here could be omitted with no loss of correctness; it is
    2147                 :      * present strictly as an optimization.
    2148                 :      *
    2149                 :      * For heap inserts, we only need to check for table-level SSI locks. Our
    2150                 :      * new tuples can't possibly conflict with existing tuple locks, and heap
    2151                 :      * page locks are only consolidated versions of tuple locks; they do not
    2152                 :      * lock "gaps" as index page locks do.  So we don't need to specify a
    2153                 :      * buffer when making the call, which makes for a faster check.
    2154                 :      */
    2155 GIC      810261 :     CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
    2156 ECB             : 
    2157 CBC      810261 :     ndone = 0;
    2158         1647735 :     while (ndone < ntuples)
    2159                 :     {
    2160 ECB             :         Buffer      buffer;
    2161 GIC      837474 :         bool        all_visible_cleared = false;
    2162 CBC      837474 :         bool        all_frozen_set = false;
    2163 ECB             :         int         nthispage;
    2164                 : 
    2165 CBC      837474 :         CHECK_FOR_INTERRUPTS();
    2166 ECB             : 
    2167                 :         /*
    2168                 :          * Compute number of pages needed to fit the to-be-inserted tuples in
    2169                 :          * the worst case.  This will be used to determine how much to extend
    2170                 :          * the relation by in RelationGetBufferForTuple(), if needed.  If we
    2171                 :          * filled a prior page from scratch, we can just update our last
    2172                 :          * computation, but if we started with a partially filled page, we
    2173                 :          * recompute from scratch: the number of potentially required pages
    2174                 :          * can vary because tuples must fit whole onto each page, because of
    2175                 :          * page headers, etc.
    2176                 :          */
    2177 GNC      837474 :         if (ndone == 0 || !starting_with_empty_page)
    2178                 :         {
    2179          827189 :             npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
    2180                 :                                              saveFreeSpace);
    2181          827189 :             npages_used = 0;
    2182                 :         }
    2183                 :         else
    2184           10285 :             npages_used++;
    2185                 : 
    2186                 :         /*
    2187                 :          * Find buffer where at least the next tuple will fit.  If the page is
    2188                 :          * all-visible, this will also pin the requisite visibility map page.
    2189                 :          *
    2190                 :          * Also pin visibility map page if COPY FREEZE inserts tuples into an
    2191                 :          * empty page. See all_frozen_set below.
    2192 ECB             :          */
    2193 CBC      837474 :         buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
    2194                 :                                            InvalidBuffer, options, bistate,
    2195                 :                                            &vmbuffer, NULL,
    2196                 :                                            npages - npages_used);
    2197 GIC      837474 :         page = BufferGetPage(buffer);
    2198 ECB             : 
    2199 CBC      837474 :         starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0;
    2200                 : 
    2201 GIC      837474 :         if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN))
    2202            1658 :             all_frozen_set = true;
    2203                 : 
    2204                 :         /* NO EREPORT(ERROR) from here till changes are logged */
    2205          837474 :         START_CRIT_SECTION();
    2206 ECB             : 
    2207                 :         /*
    2208                 :          * RelationGetBufferForTuple has ensured that the first tuple fits.
    2209                 :          * Put that on the page, and then as many other tuples as fit.
    2210                 :          */
    2211 CBC      837474 :         RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
    2212                 : 
    2213 ECB             :         /*
    2214                 :          * For logical decoding we need combo CIDs to properly decode the
    2215                 :          * catalog.
    2216                 :          */
    2217 GIC      837474 :         if (needwal && need_cids)
    2218 CBC        6295 :             log_heap_new_cid(relation, heaptuples[ndone]);
    2219                 : 
    2220         3001517 :         for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
    2221                 :         {
    2222 GIC     2191256 :             HeapTuple   heaptup = heaptuples[ndone + nthispage];
    2223 ECB             : 
    2224 GIC     2191256 :             if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
    2225           27213 :                 break;
    2226                 : 
    2227         2164043 :             RelationPutHeapTuple(relation, buffer, heaptup, false);
    2228                 : 
    2229 ECB             :             /*
    2230                 :              * For logical decoding we need combo CIDs to properly decode the
    2231                 :              * catalog.
    2232                 :              */
    2233 GIC     2164043 :             if (needwal && need_cids)
    2234            5756 :                 log_heap_new_cid(relation, heaptup);
    2235                 :         }
    2236                 : 
    2237                 :         /*
    2238                 :          * If the page is all visible, need to clear that, unless we're only
    2239 ECB             :          * going to add further frozen rows to it.
    2240                 :          *
    2241                 :          * If we're only adding already frozen rows to a previously empty
    2242                 :          * page, mark it as all-visible.
    2243                 :          */
    2244 GIC      837474 :         if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN))
    2245 ECB             :         {
    2246 CBC        2492 :             all_visible_cleared = true;
    2247 GIC        2492 :             PageClearAllVisible(page);
    2248            2492 :             visibilitymap_clear(relation,
    2249                 :                                 BufferGetBlockNumber(buffer),
    2250                 :                                 vmbuffer, VISIBILITYMAP_VALID_BITS);
    2251                 :         }
    2252          834982 :         else if (all_frozen_set)
    2253            1658 :             PageSetAllVisible(page);
    2254                 : 
    2255                 :         /*
    2256 ECB             :          * XXX Should we set PageSetPrunable on this page?  See heap_insert().
    2257                 :          */
    2258                 : 
    2259 GIC      837474 :         MarkBufferDirty(buffer);
    2260                 : 
    2261                 :         /* XLOG stuff */
    2262          837474 :         if (needwal)
    2263                 :         {
    2264                 :             XLogRecPtr  recptr;
    2265                 :             xl_heap_multi_insert *xlrec;
    2266          833127 :             uint8       info = XLOG_HEAP2_MULTI_INSERT;
    2267                 :             char       *tupledata;
    2268                 :             int         totaldatalen;
    2269          833127 :             char       *scratchptr = scratch.data;
    2270                 :             bool        init;
    2271          833127 :             int         bufflags = 0;
    2272                 : 
    2273 ECB             :             /*
    2274                 :              * If the page was previously empty, we can reinit the page
    2275                 :              * instead of restoring the whole thing.
    2276                 :              */
    2277 GIC      833127 :             init = starting_with_empty_page;
    2278                 : 
    2279                 :             /* allocate xl_heap_multi_insert struct from the scratch area */
    2280          833127 :             xlrec = (xl_heap_multi_insert *) scratchptr;
    2281 CBC      833127 :             scratchptr += SizeOfHeapMultiInsert;
    2282                 : 
    2283 ECB             :             /*
    2284                 :              * Allocate the offsets array, unless we're reinitializing the
    2285                 :              * page: in that case the tuples are stored in order starting at
    2286                 :              * FirstOffsetNumber, so we don't need to store the offsets
    2287                 :              * explicitly.
    2288                 :              */
    2289 CBC      833127 :             if (!init)
    2290 GIC      800833 :                 scratchptr += nthispage * sizeof(OffsetNumber);
    2291 ECB             : 
    2292                 :             /* the rest of the scratch space is used for tuple data */
    2293 GIC      833127 :             tupledata = scratchptr;
    2294                 : 
    2295                 :             /* check that the mutually exclusive flags are not both set */
    2296          833127 :             Assert(!(all_visible_cleared && all_frozen_set));
    2297                 : 
    2298          833127 :             xlrec->flags = 0;
    2299          833127 :             if (all_visible_cleared)
    2300            2492 :                 xlrec->flags = XLH_INSERT_ALL_VISIBLE_CLEARED;
    2301          833127 :             if (all_frozen_set)
    2302              14 :                 xlrec->flags = XLH_INSERT_ALL_FROZEN_SET;
    2303                 : 
    2304 CBC      833127 :             xlrec->ntuples = nthispage;
    2305                 : 
    2306 ECB             :             /*
    2307                 :              * Write out an xl_multi_insert_tuple and the tuple data itself
    2308                 :              * for each tuple.
    2309                 :              */
    2310 GIC     3529283 :             for (i = 0; i < nthispage; i++)
    2311                 :             {
    2312         2696156 :                 HeapTuple   heaptup = heaptuples[ndone + i];
    2313                 :                 xl_multi_insert_tuple *tuphdr;
    2314                 :                 int         datalen;
    2315                 : 
    2316         2696156 :                 if (!init)
    2317 CBC     1884242 :                     xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
    2318                 :                 /* xl_multi_insert_tuple needs two-byte alignment. */
    2319 GIC     2696156 :                 tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
    2320 CBC     2696156 :                 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
    2321 ECB             : 
    2322 CBC     2696156 :                 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
    2323 GIC     2696156 :                 tuphdr->t_infomask = heaptup->t_data->t_infomask;
    2324 CBC     2696156 :                 tuphdr->t_hoff = heaptup->t_data->t_hoff;
    2325                 : 
    2326 ECB             :                 /* write bitmap [+ padding] [+ oid] + data */
    2327 GIC     2696156 :                 datalen = heaptup->t_len - SizeofHeapTupleHeader;
    2328         2696156 :                 memcpy(scratchptr,
    2329         2696156 :                        (char *) heaptup->t_data + SizeofHeapTupleHeader,
    2330                 :                        datalen);
    2331         2696156 :                 tuphdr->datalen = datalen;
    2332         2696156 :                 scratchptr += datalen;
    2333                 :             }
    2334          833127 :             totaldatalen = scratchptr - tupledata;
    2335          833127 :             Assert((scratchptr - scratch.data) < BLCKSZ);
    2336                 : 
    2337          833127 :             if (need_tuple_data)
    2338             167 :                 xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
    2339 ECB             : 
    2340                 :             /*
    2341                 :              * Signal that this is the last xl_heap_multi_insert record
    2342                 :              * emitted by this call to heap_multi_insert(). Needed for logical
    2343                 :              * decoding so it knows when to clean up temporary data.
    2344                 :              */
    2345 CBC      833127 :             if (ndone + nthispage == ntuples)
    2346 GIC      809765 :                 xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
    2347 ECB             : 
    2348 GIC      833127 :             if (init)
    2349                 :             {
    2350           32294 :                 info |= XLOG_HEAP_INIT_PAGE;
    2351           32294 :                 bufflags |= REGBUF_WILL_INIT;
    2352                 :             }
    2353                 : 
    2354                 :             /*
    2355                 :              * If we're doing logical decoding, include the new tuple data
    2356                 :              * even if we take a full-page image of the page.
    2357                 :              */
    2358          833127 :             if (need_tuple_data)
    2359             167 :                 bufflags |= REGBUF_KEEP_DATA;
    2360                 : 
    2361          833127 :             XLogBeginInsert();
    2362 CBC      833127 :             XLogRegisterData((char *) xlrec, tupledata - scratch.data);
    2363 GIC      833127 :             XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
    2364                 : 
    2365          833127 :             XLogRegisterBufData(0, tupledata, totaldatalen);
    2366                 : 
    2367 ECB             :             /* filtering by origin on a row level is much more efficient */
    2368 GIC      833127 :             XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
    2369                 : 
    2370          833127 :             recptr = XLogInsert(RM_HEAP2_ID, info);
    2371                 : 
    2372          833127 :             PageSetLSN(page, recptr);
    2373 ECB             :         }
    2374                 : 
    2375 GIC      837474 :         END_CRIT_SECTION();
    2376                 : 
    2377 ECB             :         /*
    2378                 :          * If we've frozen everything on the page, update the visibility map.
    2379                 :          * We're already holding a pin on the vmbuffer.
    2380                 :          */
    2381 CBC      837474 :         if (all_frozen_set)
    2382                 :         {
    2383            1658 :             Assert(PageIsAllVisible(page));
    2384 GIC        1658 :             Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer));
    2385                 : 
    2386                 :             /*
    2387                 :              * It's fine to use InvalidTransactionId here - this is only used
    2388                 :              * when HEAP_INSERT_FROZEN is specified, which intentionally
    2389                 :              * violates visibility rules.
    2390 ECB             :              */
    2391 GBC        1658 :             visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer,
    2392                 :                               InvalidXLogRecPtr, vmbuffer,
    2393                 :                               InvalidTransactionId,
    2394                 :                               VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
    2395 ECB             :         }
    2396                 : 
    2397 CBC      837474 :         UnlockReleaseBuffer(buffer);
    2398 GIC      837474 :         ndone += nthispage;
    2399                 : 
    2400                 :         /*
    2401                 :          * NB: Only release vmbuffer after inserting all tuples - it's fairly
    2402                 :          * likely that we'll insert into subsequent heap pages covered by the
    2403                 :          * same vm page.
    2404                 :          */
    2405 ECB             :     }
    2406                 : 
    2407                 :     /* We're done with inserting all tuples, so release the last vmbuffer. */
    2408 CBC      810261 :     if (vmbuffer != InvalidBuffer)
    2409 GIC        2597 :         ReleaseBuffer(vmbuffer);
    2410 ECB             : 
    2411                 :     /*
    2412                 :      * We're done with the actual inserts.  Check for conflicts again, to
    2413                 :      * ensure that all rw-conflicts in to these inserts are detected.  Without
    2414                 :      * this final check, a sequential scan of the heap may have locked the
    2415                 :      * table after the "before" check, missing one opportunity to detect the
    2416                 :      * conflict, and then scanned the table before the new tuples were there,
    2417                 :      * missing the other chance to detect the conflict.
    2418                 :      *
    2419                 :      * For heap inserts, we only need to check for table-level SSI locks. Our
    2420                 :      * new tuples can't possibly conflict with existing tuple locks, and heap
    2421                 :      * page locks are only consolidated versions of tuple locks; they do not
    2422                 :      * lock "gaps" as index page locks do.  So we don't need to specify a
    2423                 :      * buffer when making the call.
    2424                 :      */
    2425 GIC      810261 :     CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
    2426 ECB             : 
    2427                 :     /*
    2428 EUB             :      * If tuples are cachable, mark them for invalidation from the caches in
    2429                 :      * case we abort.  Note it is OK to do this after releasing the buffer,
    2430                 :      * because the heaptuples data structure is all in local memory, not in
    2431                 :      * the shared buffer.
    2432                 :      */
    2433 CBC      810261 :     if (IsCatalogRelation(relation))
    2434                 :     {
    2435         2916673 :         for (i = 0; i < ntuples; i++)
    2436 GIC     2107904 :             CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
    2437 EUB             :     }
    2438                 : 
    2439                 :     /* copy t_self fields back to the caller's slots */
    2440 GIC     3811778 :     for (i = 0; i < ntuples; i++)
    2441         3001517 :         slots[i]->tts_tid = heaptuples[i]->t_self;
    2442 ECB             : 
    2443 GIC      810261 :     pgstat_count_heap_insert(relation, ntuples);
    2444          810261 : }
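/*
 * Sketch of the temporary-context pattern suggested in the header comment
 * of heap_multi_insert() (illustrative; the wrapper function is
 * hypothetical): memory leaked by the call is reclaimed wholesale when
 * the context is deleted.
 */
static void
multi_insert_in_tmp_context(Relation rel, TupleTableSlot **slots,
                            int ntuples, CommandId cid)
{
    MemoryContext cxt = AllocSetContextCreate(CurrentMemoryContext,
                                              "multi-insert scratch",
                                              ALLOCSET_DEFAULT_SIZES);
    MemoryContext oldcxt = MemoryContextSwitchTo(cxt);

    heap_multi_insert(rel, slots, ntuples, cid, 0, NULL);

    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(cxt);
}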
    2445                 : 
    2446                 : /*
    2447                 :  *  simple_heap_insert - insert a tuple
    2448 ECB             :  *
    2449                 :  * Currently, this routine differs from heap_insert only in supplying
    2450                 :  * a default command ID and not allowing access to the speedup options.
    2451                 :  *
    2452                 :  * This should be used rather than using heap_insert directly in most places
    2453                 :  * where we are modifying system catalogs.
    2454                 :  */
    2455                 : void
    2456 GIC     4591015 : simple_heap_insert(Relation relation, HeapTuple tup)
    2457                 : {
    2458         4591015 :     heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
    2459         4591015 : }
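/*
 * Usage note (not from heapam.c): catalog code normally reaches this via
 * CatalogTupleInsert() in catalog/indexing.c, which calls
 * simple_heap_insert() and then updates the catalog's indexes.
 */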
    2460                 : 
    2461                 : /*
    2462                 :  * Given infomask/infomask2, compute the bits that must be saved in the
    2463                 :  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
    2464 ECB             :  * xl_heap_lock_updated WAL records.
    2465                 :  *
    2466                 :  * See fix_infomask_from_infobits.
    2467                 :  */
    2468                 : static uint8
    2469 GIC     2034567 : compute_infobits(uint16 infomask, uint16 infomask2)
    2470                 : {
    2471 ECB             :     return
    2472 GIC     2034567 :         ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
    2473         2034567 :         ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
    2474         2034567 :         ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
    2475                 :     /* note we ignore HEAP_XMAX_SHR_LOCK here */
    2476         4069134 :         ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
    2477 ECB             :         ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
    2478 CBC     2034567 :          XLHL_KEYS_UPDATED : 0);
    2479                 : }
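/*
 * Example (illustrative): an xmax that is a multixact holding only a
 * key-share lock has HEAP_XMAX_IS_MULTI, HEAP_XMAX_LOCK_ONLY and
 * HEAP_XMAX_KEYSHR_LOCK set, so compute_infobits() returns
 * XLHL_XMAX_IS_MULTI | XLHL_XMAX_LOCK_ONLY | XLHL_XMAX_KEYSHR_LOCK;
 * redo expands these back into infomask bits via
 * fix_infomask_from_infobits().
 */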
    2480                 : 
    2481                 : /*
    2482 ECB             :  * Given two versions of the same t_infomask for a tuple, compare them and
    2483                 :  * return whether the relevant status for a tuple Xmax has changed.  This is
    2484                 :  * used after a buffer lock has been released and reacquired: we want to ensure
    2485                 :  * that the tuple state continues to be the same it was when we previously
    2486                 :  * that the tuple state continues to be the same as it was when we previously
    2487                 :  *
    2488                 :  * Note the Xmax field itself must be compared separately.
    2489                 :  */
    2490                 : static inline bool
    2491 GIC        5303 : xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
    2492                 : {
    2493            5303 :     const uint16 interesting =
    2494                 :     HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
    2495 ECB             : 
    2496 CBC        5303 :     if ((new_infomask & interesting) != (old_infomask & interesting))
    2497              14 :         return true;
    2498                 : 
    2499 GBC        5289 :     return false;
    2500                 : }
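/*
 * Illustrative scenario: while we slept without the buffer lock, another
 * backend replaced a key-share lock on the tuple with an exclusive lock.
 * The masks then differ within "interesting", xmax_infomask_changed()
 * returns true, and the caller restarts (e.g. the "goto l1" retry in
 * heap_delete() below).
 */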
    2501                 : 
    2502                 : /*
    2503                 :  *  heap_delete - delete a tuple
    2504                 :  *
    2505                 :  * See table_tuple_delete() for an explanation of the parameters, except that
    2506                 :  * this routine directly takes a tuple rather than a slot.
    2507                 :  *
    2508                 :  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
    2509                 :  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
    2510                 :  * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
    2511                 :  * generated by another transaction).
    2512 ECB             :  */
    2513                 : TM_Result
    2514 GIC     1420155 : heap_delete(Relation relation, ItemPointer tid,
    2515                 :             CommandId cid, Snapshot crosscheck, bool wait,
    2516                 :             TM_FailureData *tmfd, bool changingPart)
    2517                 : {
    2518 ECB             :     TM_Result   result;
    2519 CBC     1420155 :     TransactionId xid = GetCurrentTransactionId();
    2520                 :     ItemId      lp;
    2521 ECB             :     HeapTupleData tp;
    2522                 :     Page        page;
    2523                 :     BlockNumber block;
    2524                 :     Buffer      buffer;
    2525 GIC     1420155 :     Buffer      vmbuffer = InvalidBuffer;
    2526                 :     TransactionId new_xmax;
    2527                 :     uint16      new_infomask,
    2528                 :                 new_infomask2;
    2529         1420155 :     bool        have_tuple_lock = false;
    2530                 :     bool        iscombo;
    2531         1420155 :     bool        all_visible_cleared = false;
    2532 CBC     1420155 :     HeapTuple   old_key_tuple = NULL;   /* replica identity of the tuple */
    2533         1420155 :     bool        old_key_copied = false;
    2534 ECB             : 
    2535 GIC     1420155 :     Assert(ItemPointerIsValid(tid));
    2536 ECB             : 
    2537                 :     /*
    2538                 :      * Forbid this during a parallel operation, lest it allocate a combo CID.
    2539                 :      * Other workers might need that combo CID for visibility checks, and we
    2540                 :      * have no provision for broadcasting it to them.
    2541                 :      */
    2542 GIC     1420155 :     if (IsInParallelMode())
    2543 UIC           0 :         ereport(ERROR,
    2544                 :                 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
    2545                 :                  errmsg("cannot delete tuples during a parallel operation")));
    2546 ECB             : 
    2547 CBC     1420155 :     block = ItemPointerGetBlockNumber(tid);
    2548         1420155 :     buffer = ReadBuffer(relation, block);
    2549         1420155 :     page = BufferGetPage(buffer);
    2550 ECB             : 
    2551                 :     /*
    2552                 :      * Before locking the buffer, pin the visibility map page if it appears to
    2553                 :      * be necessary.  Since we haven't got the lock yet, someone else might be
    2554                 :      * in the middle of changing this, so we'll need to recheck after we have
    2555                 :      * the lock.
    2556                 :      */
    2557 GIC     1420155 :     if (PageIsAllVisible(page))
    2558            1022 :         visibilitymap_pin(relation, block, &vmbuffer);
    2559 EUB             : 
    2560 GBC     1420155 :     LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    2561                 : 
    2562 GIC     1420155 :     lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
    2563 CBC     1420155 :     Assert(ItemIdIsNormal(lp));
    2564                 : 
    2565         1420155 :     tp.t_tableOid = RelationGetRelid(relation);
    2566 GIC     1420155 :     tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
    2567         1420155 :     tp.t_len = ItemIdGetLength(lp);
    2568         1420155 :     tp.t_self = *tid;
    2569 ECB             : 
    2570 CBC           1 : l1:
    2571                 : 
    2572                 :     /*
    2573 ECB             :      * If we didn't pin the visibility map page and the page has become all
    2574                 :      * visible while we were busy locking the buffer, we'll have to unlock and
    2575                 :      * re-lock, to avoid holding the buffer lock across an I/O.  That's a bit
    2576                 :      * unfortunate, but hopefully shouldn't happen often.
    2577                 :      */
    2578 CBC     1420156 :     if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
    2579 ECB             :     {
    2580 LBC           0 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    2581               0 :         visibilitymap_pin(relation, block, &vmbuffer);
    2582               0 :         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    2583 EUB             :     }
    2584 ECB             : 
    2585 GIC     1420156 :     result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
    2586                 : 
    2587         1420156 :     if (result == TM_Invisible)
    2588                 :     {
    2589 UIC           0 :         UnlockReleaseBuffer(buffer);
    2590               0 :         ereport(ERROR,
    2591                 :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    2592                 :                  errmsg("attempted to delete invisible tuple")));
    2593                 :     }
    2594 GIC     1420156 :     else if (result == TM_BeingModified && wait)
    2595                 :     {
    2596 ECB             :         TransactionId xwait;
    2597                 :         uint16      infomask;
    2598                 : 
    2599                 :         /* must copy state data before unlocking buffer */
    2600 GIC       40518 :         xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
    2601           40518 :         infomask = tp.t_data->t_infomask;
    2602                 : 
    2603                 :         /*
    2604                 :          * Sleep until concurrent transaction ends -- except when there's a
    2605 ECB             :          * single locker and it's our own transaction.  Note we don't care
    2606                 :          * which lock mode the locker has, because we need the strongest one.
    2607                 :          *
    2608                 :          * Before sleeping, we need to acquire tuple lock to establish our
    2609                 :          * priority for the tuple (see heap_lock_tuple).  LockTuple will
    2610                 :          * release us when we are next-in-line for the tuple.
    2611                 :          *
    2612                 :          * If we are forced to "start over" below, we keep the tuple lock;
    2613                 :          * this arranges that we stay at the head of the line while rechecking
    2614                 :          * tuple state.
    2615                 :          */
    2616 GIC       40518 :         if (infomask & HEAP_XMAX_IS_MULTI)
    2617 ECB             :         {
    2618 CBC           8 :             bool        current_is_member = false;
    2619                 : 
    2620 GIC           8 :             if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
    2621                 :                                         LockTupleExclusive, &current_is_member))
    2622 ECB             :             {
    2623 GIC           8 :                 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    2624                 : 
    2625                 :                 /*
    2626                 :                  * Acquire the lock, if necessary (but skip it when we're
    2627                 :                  * requesting a lock and already have one; avoids deadlock).
    2628                 :                  */
    2629               8 :                 if (!current_is_member)
    2630               6 :                     heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
    2631 ECB             :                                          LockWaitBlock, &have_tuple_lock);
    2632                 : 
    2633                 :                 /* wait for multixact */
    2634 GIC           8 :                 MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
    2635 ECB             :                                 relation, &(tp.t_self), XLTW_Delete,
    2636                 :                                 NULL);
    2637 CBC           8 :                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    2638                 : 
    2639                 :                 /*
    2640                 :                  * If xwait had just locked the tuple then some other xact
    2641                 :                  * could update this tuple before we get to this point.  Check
    2642 ECB             :                  * for xmax change, and start over if so.
    2643                 :                  *
    2644                 :                  * We also must start over if we didn't pin the VM page, and
    2645                 :                  * the page has become all visible.
    2646                 :                  */
    2647 CBC          16 :                 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
    2648               8 :                     xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
    2649 GIC           8 :                     !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
    2650 ECB             :                                          xwait))
    2651 UIC           0 :                     goto l1;
    2652                 :             }
    2653 ECB             : 
    2654                 :             /*
    2655                 :              * You might think the multixact is necessarily done here, but not
    2656                 :              * so: it could have surviving members, namely our own xact or
    2657                 :              * other subxacts of this backend.  It is legal for us to delete
    2658                 :              * the tuple in either case, however (the latter case is
    2659                 :              * essentially a situation of upgrading our former shared lock to
    2660                 :              * exclusive).  We don't bother changing the on-disk hint bits
    2661                 :              * since we are about to overwrite the xmax altogether.
    2662                 :              */
    2663                 :         }
    2664 CBC       40510 :         else if (!TransactionIdIsCurrentTransactionId(xwait))
    2665                 :         {
    2666                 :             /*
    2667                 :              * Wait for regular transaction to end; but first, acquire tuple
    2668                 :              * lock.
    2669                 :              */
    2670 GIC          36 :             LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    2671              36 :             heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
    2672                 :                                  LockWaitBlock, &have_tuple_lock);
    2673              36 :             XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
    2674 CBC          32 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    2675 ECB             : 
    2676                 :             /*
    2677                 :              * xwait is done, but if xwait had just locked the tuple then some
    2678                 :              * other xact could update this tuple before we get to this point.
    2679                 :              * Check for xmax change, and start over if so.
    2680                 :              *
    2681                 :              * We also must start over if we didn't pin the VM page, and the
    2682                 :              * page has become all visible.
    2683                 :              */
    2684 CBC          64 :             if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
    2685              32 :                 xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
    2686 GIC          31 :                 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
    2687 ECB             :                                      xwait))
    2688 GIC           1 :                 goto l1;
    2689 ECB             : 
    2690                 :             /* Otherwise check if it committed or aborted */
    2691 GIC          31 :             UpdateXmaxHintBits(tp.t_data, buffer, xwait);
    2692 ECB             :         }
    2693                 : 
    2694                 :         /*
    2695                 :          * We may overwrite if the previous xmax aborted, or if it committed but
    2696                 :          * only locked the tuple without updating it.
    2697                 :          */
    2698 CBC       40513 :         if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
    2699 GIC       40524 :             HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
    2700              23 :             HeapTupleHeaderIsOnlyLocked(tp.t_data))
    2701           40494 :             result = TM_Ok;
    2702              19 :         else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
    2703 CBC          15 :             result = TM_Updated;
    2704                 :         else
    2705               4 :             result = TM_Deleted;
    2706 ECB             :     }
    2707                 : 
    2708 GIC     1420151 :     if (crosscheck != InvalidSnapshot && result == TM_Ok)
    2709 ECB             :     {
    2710                 :         /* Perform additional check for transaction-snapshot mode RI updates */
    2711 UIC           0 :         if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
    2712 LBC           0 :             result = TM_Updated;
    2713                 :     }
    2714                 : 
    2715 GIC     1420151 :     if (result != TM_Ok)
    2716                 :     {
    2717 CBC          39 :         Assert(result == TM_SelfModified ||
    2718                 :                result == TM_Updated ||
    2719 ECB             :                result == TM_Deleted ||
    2720                 :                result == TM_BeingModified);
    2721 CBC          39 :         Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
    2722 GIC          39 :         Assert(result != TM_Updated ||
    2723                 :                !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
    2724 CBC          39 :         tmfd->ctid = tp.t_data->t_ctid;
    2725 GIC          39 :         tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
    2726 CBC          39 :         if (result == TM_SelfModified)
    2727 GIC          18 :             tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
    2728 ECB             :         else
    2729 CBC          21 :             tmfd->cmax = InvalidCommandId;
    2730 GIC          39 :         UnlockReleaseBuffer(buffer);
    2731              39 :         if (have_tuple_lock)
    2732              19 :             UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
    2733              39 :         if (vmbuffer != InvalidBuffer)
    2734 UIC           0 :             ReleaseBuffer(vmbuffer);
    2735 GIC          39 :         return result;
    2736                 :     }
    2737 ECB             : 
    2738                 :     /*
    2739                 :      * We're about to do the actual delete -- check for conflict first, to
    2740                 :      * avoid possibly having to roll back work we've just done.
    2741                 :      *
    2742                 :      * This is safe without a recheck as long as there is no possibility of
    2743                 :      * another process scanning the page between this check and the delete
    2744                 :      * being visible to the scan (i.e., an exclusive buffer content lock is
    2745                 :      * continuously held from this point until the tuple delete is visible).
    2746                 :      */
    2747 GIC     1420112 :     CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer));
    2748                 : 
    2749                 :     /* replace cid with a combo CID if necessary */
    2750         1420098 :     HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
    2751 ECB             : 
    2752                 :     /*
    2753                 :      * Compute replica identity tuple before entering the critical section so
     2754                 :      * Compute the replica identity tuple before entering the critical section
    2755                 :      */
    2756 GIC     1420098 :     old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
    2757                 : 
    2758                 :     /*
    2759 ECB             :      * If this is the first possibly-multixact-able operation in the current
    2760                 :      * transaction, set my per-backend OldestMemberMXactId setting. We can be
    2761                 :      * certain that the transaction will never become a member of any older
    2762                 :      * MultiXactIds than that.  (We have to do this even if we end up just
    2763                 :      * using our own TransactionId below, since some other backend could
    2764                 :      * incorporate our XID into a MultiXact immediately afterwards.)
    2765                 :      */
    2766 GIC     1420098 :     MultiXactIdSetOldestMember();
    2767 ECB             : 
    2768 GIC     1420098 :     compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
    2769         1420098 :                               tp.t_data->t_infomask, tp.t_data->t_infomask2,
    2770                 :                               xid, LockTupleExclusive, true,
    2771                 :                               &new_xmax, &new_infomask, &new_infomask2);
    2772                 : 
    2773         1420098 :     START_CRIT_SECTION();
    2774                 : 
    2775                 :     /*
    2776                 :      * If this transaction commits, the tuple will become DEAD sooner or
    2777                 :      * later.  Set flag that this page is a candidate for pruning once our xid
    2778                 :      * falls below the OldestXmin horizon.  If the transaction finally aborts,
    2779 ECB             :      * the subsequent page pruning will be a no-op and the hint will be
    2780                 :      * cleared.
    2781                 :      */
    2782 GIC     1420098 :     PageSetPrunable(page, xid);
    2783                 : 
    2784 CBC     1420098 :     if (PageIsAllVisible(page))
    2785                 :     {
    2786 GIC        1022 :         all_visible_cleared = true;
    2787            1022 :         PageClearAllVisible(page);
    2788 CBC        1022 :         visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
    2789                 :                             vmbuffer, VISIBILITYMAP_VALID_BITS);
    2790 EUB             :     }
    2791                 : 
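                          : /*
                          :  * VISIBILITYMAP_VALID_BITS covers both of the page's visibility-map
                          :  * bits (all-visible and all-frozen); the delete invalidates both, so
                          :  * they are cleared together above.
                          :  */
                          : 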
    2792                 :     /* store transaction information of xact deleting the tuple */
    2793 GIC     1420098 :     tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
    2794         1420098 :     tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    2795 CBC     1420098 :     tp.t_data->t_infomask |= new_infomask;
    2796 GIC     1420098 :     tp.t_data->t_infomask2 |= new_infomask2;
    2797 CBC     1420098 :     HeapTupleHeaderClearHotUpdated(tp.t_data);
    2798 GIC     1420098 :     HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
    2799 GBC     1420098 :     HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
    2800 EUB             :     /* Make sure there is no forward chain link in t_ctid */
    2801 GIC     1420098 :     tp.t_data->t_ctid = tp.t_self;
    2802                 : 
    2803 EUB             :     /* Signal that this is actually a move into another partition */
    2804 GBC     1420098 :     if (changingPart)
    2805 GIC         404 :         HeapTupleHeaderSetMovedPartitions(tp.t_data);
    2806                 : 
    2807 GBC     1420098 :     MarkBufferDirty(buffer);
    2808 EUB             : 
    2809                 :     /*
    2810                 :      * XLOG stuff
    2811 ECB             :      *
    2812                 :      * NB: heap_abort_speculative() uses the same xlog record and replay
    2813                 :      * routines.
    2814                 :      */
    2815 GIC     1420098 :     if (RelationNeedsWAL(relation))
    2816                 :     {
    2817                 :         xl_heap_delete xlrec;
    2818                 :         xl_heap_header xlhdr;
    2819                 :         XLogRecPtr  recptr;
    2820                 : 
    2821                 :         /*
     2822                 :          * For logical decoding we need combo CIDs to properly decode the
     2823                 :          * catalog.
    2824                 :          */
    2825 CBC     1359543 :         if (RelationIsAccessibleInLogicalDecoding(relation))
    2826 GIC        6005 :             log_heap_new_cid(relation, &tp);
    2827                 : 
    2828         1359543 :         xlrec.flags = 0;
    2829         1359543 :         if (all_visible_cleared)
    2830            1022 :             xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
    2831 CBC     1359543 :         if (changingPart)
    2832 GIC         404 :             xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
    2833         2719086 :         xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
    2834         1359543 :                                               tp.t_data->t_infomask2);
    2835         1359543 :         xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
    2836         1359543 :         xlrec.xmax = new_xmax;
    2837                 : 
    2838         1359543 :         if (old_key_tuple != NULL)
    2839                 :         {
    2840           82320 :             if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
    2841 CBC         221 :                 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
    2842 ECB             :             else
    2843 GIC       82099 :                 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
    2844                 :         }
    2845                 : 
    2846         1359543 :         XLogBeginInsert();
    2847         1359543 :         XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
    2848 ECB             : 
    2849 CBC     1359543 :         XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
    2850                 : 
    2851                 :         /*
     2852                 :          * Log the replica identity of the deleted tuple, if there is one
    2853 ECB             :          */
    2854 GIC     1359543 :         if (old_key_tuple != NULL)
    2855 ECB             :         {
    2856 CBC       82320 :             xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
    2857 GIC       82320 :             xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
    2858 CBC       82320 :             xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
    2859 ECB             : 
    2860 GIC       82320 :             XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
    2861           82320 :             XLogRegisterData((char *) old_key_tuple->t_data
    2862 ECB             :                              + SizeofHeapTupleHeader,
    2863 GIC       82320 :                              old_key_tuple->t_len
    2864                 :                              - SizeofHeapTupleHeader);
    2865                 :         }
    2866                 : 
    2867                 :         /* filtering by origin on a row level is much more efficient */
    2868         1359543 :         XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
    2869                 : 
    2870 CBC     1359543 :         recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
    2871                 : 
    2872 GIC     1359543 :         PageSetLSN(page, recptr);
    2873 ECB             :     }
    2874                 : 
    2875 GIC     1420098 :     END_CRIT_SECTION();
    2876                 : 
    2877         1420098 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    2878                 : 
    2879         1420098 :     if (vmbuffer != InvalidBuffer)
    2880            1022 :         ReleaseBuffer(vmbuffer);
    2881 ECB             : 
    2882 EUB             :     /*
    2883                 :      * If the tuple has toasted out-of-line attributes, we need to delete
    2884                 :      * those items too.  We have to do this before releasing the buffer
    2885                 :      * because we need to look at the contents of the tuple, but it's OK to
    2886                 :      * release the content lock on the buffer first.
    2887                 :      */
    2888 GIC     1420098 :     if (relation->rd_rel->relkind != RELKIND_RELATION &&
    2889            1569 :         relation->rd_rel->relkind != RELKIND_MATVIEW)
    2890                 :     {
    2891                 :         /* toast table entries should never be recursively toasted */
    2892            1559 :         Assert(!HeapTupleHasExternal(&tp));
    2893                 :     }
    2894         1418539 :     else if (HeapTupleHasExternal(&tp))
    2895             233 :         heap_toast_delete(relation, &tp, false);
    2896                 : 
    2897                 :     /*
    2898                 :      * Mark tuple for invalidation from system caches at next command
    2899                 :      * boundary. We have to do this before releasing the buffer because we
    2900                 :      * need to look at the contents of the tuple.
    2901                 :      */
    2902 CBC     1420098 :     CacheInvalidateHeapTuple(relation, &tp, NULL);
    2903                 : 
    2904 ECB             :     /* Now we can release the buffer */
    2905 GIC     1420098 :     ReleaseBuffer(buffer);
    2906 ECB             : 
    2907                 :     /*
    2908                 :      * Release the lmgr tuple lock, if we had it.
    2909                 :      */
    2910 CBC     1420098 :     if (have_tuple_lock)
    2911              18 :         UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
    2912 ECB             : 
    2913 CBC     1420098 :     pgstat_count_heap_delete(relation);
    2914                 : 
    2915         1420098 :     if (old_key_tuple != NULL && old_key_copied)
    2916           82100 :         heap_freetuple(old_key_tuple);
    2917 ECB             : 
    2918 GIC     1420098 :     return TM_Ok;
    2919                 : }
    2920                 : 
    2921                 : /*
    2922                 :  *  simple_heap_delete - delete a tuple
    2923                 :  *
    2924                 :  * This routine may be used to delete a tuple when concurrent updates of
    2925 ECB             :  * the target tuple are not expected (for example, because we have a lock
    2926                 :  * on the relation associated with the tuple).  Any failure is reported
    2927                 :  * via ereport().
    2928                 :  */
    2929                 : void
    2930 CBC      536249 : simple_heap_delete(Relation relation, ItemPointer tid)
    2931 ECB             : {
    2932                 :     TM_Result   result;
    2933                 :     TM_FailureData tmfd;
    2934                 : 
    2935 GIC      536249 :     result = heap_delete(relation, tid,
    2936                 :                          GetCurrentCommandId(true), InvalidSnapshot,
    2937 ECB             :                          true /* wait for commit */ ,
    2938                 :                          &tmfd, false /* changingPart */ );
    2939 CBC      536249 :     switch (result)
    2940 ECB             :     {
    2941 UIC           0 :         case TM_SelfModified:
    2942                 :             /* Tuple was already updated in current command? */
    2943 LBC           0 :             elog(ERROR, "tuple already updated by self");
    2944                 :             break;
    2945                 : 
    2946 GIC      536249 :         case TM_Ok:
    2947                 :             /* done successfully */
    2948          536249 :             break;
    2949                 : 
    2950 UIC           0 :         case TM_Updated:
    2951               0 :             elog(ERROR, "tuple concurrently updated");
    2952                 :             break;
    2953 ECB             : 
    2954 UIC           0 :         case TM_Deleted:
    2955               0 :             elog(ERROR, "tuple concurrently deleted");
    2956                 :             break;
    2957                 : 
    2958               0 :         default:
    2959               0 :             elog(ERROR, "unrecognized heap_delete status: %u", result);
    2960                 :             break;
    2961                 :     }
    2962 GIC      536249 : }
    2963                 : 
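                          : /*
                          :  * By contrast, a caller that does expect concurrent updates would
                          :  * inspect heap_delete's TM_Result itself rather than erroring out.  A
                          :  * minimal sketch (try_heap_delete is a hypothetical helper, not part
                          :  * of heapam.c):
                          :  */
                          : #ifdef NOT_USED
                          : static bool
                          : try_heap_delete(Relation relation, ItemPointer tid)
                          : {
                          :     TM_FailureData tmfd;
                          :     TM_Result   result;
                          : 
                          :     result = heap_delete(relation, tid,
                          :                          GetCurrentCommandId(true), InvalidSnapshot,
                          :                          true /* wait for commit */ ,
                          :                          &tmfd, false /* changingPart */ );
                          : 
                          :     if (result == TM_Ok)
                          :         return true;            /* tuple deleted */
                          : 
                          :     /*
                          :      * TM_SelfModified, TM_Updated, or TM_Deleted: tmfd.ctid and
                          :      * tmfd.xmax identify the conflicting tuple version; the caller can
                          :      * refetch and retry, or report a serialization failure.
                          :      */
                          :     return false;
                          : }
                          : #endif                          /* NOT_USED */
                          : 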
    2964                 : /*
    2965                 :  *  heap_update - replace a tuple
    2966                 :  *
    2967                 :  * See table_tuple_update() for an explanation of the parameters, except that
    2968 ECB             :  * this routine directly takes a tuple rather than a slot.
    2969                 :  *
    2970                 :  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
    2971                 :  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
    2972                 :  * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
    2973                 :  * generated by another transaction).
    2974                 :  */
    2975                 : TM_Result
    2976 GIC      418994 : heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
    2977                 :             CommandId cid, Snapshot crosscheck, bool wait,
    2978                 :             TM_FailureData *tmfd, LockTupleMode *lockmode,
    2979                 :             TU_UpdateIndexes *update_indexes)
    2980                 : {
    2981                 :     TM_Result   result;
    2982          418994 :     TransactionId xid = GetCurrentTransactionId();
    2983                 :     Bitmapset  *hot_attrs;
    2984                 :     Bitmapset  *sum_attrs;
    2985 ECB             :     Bitmapset  *key_attrs;
    2986                 :     Bitmapset  *id_attrs;
    2987                 :     Bitmapset  *interesting_attrs;
    2988                 :     Bitmapset  *modified_attrs;
    2989                 :     ItemId      lp;
    2990                 :     HeapTupleData oldtup;
    2991                 :     HeapTuple   heaptup;
    2992 GIC      418994 :     HeapTuple   old_key_tuple = NULL;
    2993          418994 :     bool        old_key_copied = false;
    2994                 :     Page        page;
    2995                 :     BlockNumber block;
    2996                 :     MultiXactStatus mxact_status;
    2997                 :     Buffer      buffer,
    2998                 :                 newbuf,
    2999          418994 :                 vmbuffer = InvalidBuffer,
    3000          418994 :                 vmbuffer_new = InvalidBuffer;
    3001 ECB             :     bool        need_toast;
    3002                 :     Size        newtupsize,
    3003                 :                 pagefree;
    3004 CBC      418994 :     bool        have_tuple_lock = false;
    3005                 :     bool        iscombo;
    3006 GIC      418994 :     bool        use_hot_update = false;
    3007 GNC      418994 :     bool        summarized_update = false;
    3008 ECB             :     bool        key_intact;
    3009 GIC      418994 :     bool        all_visible_cleared = false;
    3010 CBC      418994 :     bool        all_visible_cleared_new = false;
    3011                 :     bool        checked_lockers;
    3012 EUB             :     bool        locker_remains;
    3013 GBC      418994 :     bool        id_has_external = false;
    3014                 :     TransactionId xmax_new_tuple,
    3015                 :                 xmax_old_tuple;
    3016                 :     uint16      infomask_old_tuple,
    3017 ECB             :                 infomask2_old_tuple,
    3018                 :                 infomask_new_tuple,
    3019                 :                 infomask2_new_tuple;
    3020                 : 
    3021 CBC      418994 :     Assert(ItemPointerIsValid(otid));
    3022                 : 
    3023                 :     /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
    3024 GIC      418994 :     Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
    3025                 :            RelationGetNumberOfAttributes(relation));
    3026                 : 
    3027                 :     /*
    3028                 :      * Forbid this during a parallel operation, lest it allocate a combo CID.
    3029                 :      * Other workers might need that combo CID for visibility checks, and we
    3030                 :      * have no provision for broadcasting it to them.
    3031                 :      */
    3032          418994 :     if (IsInParallelMode())
    3033 UIC           0 :         ereport(ERROR,
    3034                 :                 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
    3035                 :                  errmsg("cannot update tuples during a parallel operation")));
    3036                 : 
    3037 ECB             :     /*
    3038                 :      * Fetch the list of attributes to be checked for various operations.
    3039                 :      *
    3040                 :      * For HOT considerations, this is wasted effort if we fail to update or
    3041                 :      * have to put the new tuple on a different page.  But we must compute the
    3042                 :      * list before obtaining buffer lock --- in the worst case, if we are
    3043                 :      * doing an update on one of the relevant system catalogs, we could
    3044                 :      * deadlock if we try to fetch the list later.  In any case, the relcache
    3045                 :      * caches the data so this is usually pretty cheap.
    3046                 :      *
    3047                 :      * We also need columns used by the replica identity and columns that are
    3048                 :      * considered the "key" of rows in the table.
    3049                 :      *
    3050                 :      * Note that we get copies of each bitmap, so we need not worry about
    3051                 :      * relcache flush happening midway through.
    3052                 :      */
    3053 GNC      418994 :     hot_attrs = RelationGetIndexAttrBitmap(relation,
    3054                 :                                            INDEX_ATTR_BITMAP_HOT_BLOCKING);
    3055          418994 :     sum_attrs = RelationGetIndexAttrBitmap(relation,
    3056                 :                                            INDEX_ATTR_BITMAP_SUMMARIZED);
    3057 GIC      418994 :     key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
    3058          418994 :     id_attrs = RelationGetIndexAttrBitmap(relation,
    3059                 :                                           INDEX_ATTR_BITMAP_IDENTITY_KEY);
    3060          418994 :     interesting_attrs = NULL;
    3061          418994 :     interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
    3062 GNC      418994 :     interesting_attrs = bms_add_members(interesting_attrs, sum_attrs);
    3063 GIC      418994 :     interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
    3064          418994 :     interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
    3065                 : 
    3066 CBC      418994 :     block = ItemPointerGetBlockNumber(otid);
    3067 GIC      418994 :     buffer = ReadBuffer(relation, block);
    3068          418994 :     page = BufferGetPage(buffer);
    3069                 : 
    3070 ECB             :     /*
    3071                 :      * Before locking the buffer, pin the visibility map page if it appears to
    3072                 :      * be necessary.  Since we haven't got the lock yet, someone else might be
    3073                 :      * in the middle of changing this, so we'll need to recheck after we have
    3074                 :      * the lock.
    3075                 :      */
    3076 GIC      418994 :     if (PageIsAllVisible(page))
    3077             961 :         visibilitymap_pin(relation, block, &vmbuffer);
    3078                 : 
    3079          418994 :     LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    3080                 : 
    3081 CBC      418994 :     lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
    3082 GBC      418994 :     Assert(ItemIdIsNormal(lp));
    3083                 : 
    3084                 :     /*
    3085                 :      * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
    3086 ECB             :      * properly.
    3087                 :      */
    3088 GIC      418994 :     oldtup.t_tableOid = RelationGetRelid(relation);
    3089 CBC      418994 :     oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
    3090          418994 :     oldtup.t_len = ItemIdGetLength(lp);
    3091          418994 :     oldtup.t_self = *otid;
    3092                 : 
    3093                 :     /* the new tuple is ready, except for this: */
    3094 GIC      418994 :     newtup->t_tableOid = RelationGetRelid(relation);
    3095                 : 
    3096                 :     /*
    3097                 :      * Determine columns modified by the update.  Additionally, identify
    3098 ECB             :      * whether any of the unmodified replica identity key attributes in the
    3099                 :      * old tuple is externally stored or not.  This is required because for
    3100                 :      * such attributes the flattened value won't be WAL logged as part of the
    3101                 :      * new tuple so we must include it as part of the old_key_tuple.  See
    3102 EUB             :      * ExtractReplicaIdentity.
    3103                 :      */
    3104 GIC      418994 :     modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs,
    3105                 :                                               id_attrs, &oldtup,
    3106                 :                                               newtup, &id_has_external);
    3107                 : 
    3108                 :     /*
    3109                 :      * If we're not updating any "key" column, we can grab a weaker lock type.
    3110                 :      * This allows for more concurrency when we are running simultaneously
    3111                 :      * with foreign key checks.
    3112                 :      *
    3113                 :      * Note that if a column gets detoasted while executing the update, but
    3114                 :      * the value ends up being the same, this test will fail and we will use
    3115                 :      * the stronger lock.  This is acceptable; the important case to optimize
    3116                 :      * is updates that don't manipulate key columns, not those that
    3117                 :      * serendipitously arrive at the same key values.
    3118                 :      */
    3119          418994 :     if (!bms_overlap(modified_attrs, key_attrs))
    3120                 :     {
    3121          415546 :         *lockmode = LockTupleNoKeyExclusive;
    3122          415546 :         mxact_status = MultiXactStatusNoKeyUpdate;
    3123          415546 :         key_intact = true;
    3124                 : 
    3125 ECB             :         /*
    3126                 :          * If this is the first possibly-multixact-able operation in the
    3127                 :          * current transaction, set my per-backend OldestMemberMXactId
    3128                 :          * setting. We can be certain that the transaction will never become a
    3129                 :          * member of any older MultiXactIds than that.  (We have to do this
    3130                 :          * even if we end up just using our own TransactionId below, since
    3131                 :          * some other backend could incorporate our XID into a MultiXact
    3132                 :          * immediately afterwards.)
    3133                 :          */
    3134 GIC      415546 :         MultiXactIdSetOldestMember();
    3135 ECB             :     }
    3136                 :     else
    3137                 :     {
    3138 GIC        3448 :         *lockmode = LockTupleExclusive;
    3139 CBC        3448 :         mxact_status = MultiXactStatusUpdate;
    3140 GIC        3448 :         key_intact = false;
    3141                 :     }
    3142                 : 
    3143                 :     /*
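                          : /*
                          :  * For example (hypothetical schema): "UPDATE accounts SET balance = 0
                          :  * WHERE id = 1" touches no key column, so it takes
                          :  * LockTupleNoKeyExclusive and can run concurrently with a foreign-key
                          :  * check's SELECT ... FOR KEY SHARE on the same row, whereas "UPDATE
                          :  * accounts SET id = 2 WHERE id = 1" modifies a key column and falls
                          :  * into the LockTupleExclusive branch above.
                          :  */
                          : 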
    3144                 :      * Note: beyond this point, use oldtup not otid to refer to old tuple.
    3145 ECB             :      * otid may very well point at newtup->t_self, which we will overwrite
    3146                 :      * with the new tuple's location, so there's great risk of confusion if we
    3147                 :      * use otid anymore.
    3148                 :      */
    3149                 : 
    3150 GIC      418994 : l2:
    3151          418995 :     checked_lockers = false;
    3152          418995 :     locker_remains = false;
    3153          418995 :     result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
    3154                 : 
    3155                 :     /* see below about the "no wait" case */
    3156 CBC      418995 :     Assert(result != TM_BeingModified || wait);
    3157 ECB             : 
    3158 CBC      418995 :     if (result == TM_Invisible)
    3159                 :     {
    3160 UIC           0 :         UnlockReleaseBuffer(buffer);
    3161               0 :         ereport(ERROR,
    3162                 :                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
    3163                 :                  errmsg("attempted to update invisible tuple")));
    3164                 :     }
    3165 GIC      418995 :     else if (result == TM_BeingModified && wait)
    3166 ECB             :     {
    3167                 :         TransactionId xwait;
    3168                 :         uint16      infomask;
    3169 CBC       35847 :         bool        can_continue = false;
    3170                 : 
    3171 ECB             :         /*
    3172                 :          * XXX note that we don't consider the "no wait" case here.  This
    3173                 :          * isn't a problem currently because no caller uses that case, but it
    3174                 :          * should be fixed if such a caller is introduced.  It wasn't a
    3175                 :          * problem previously because this code would always wait, but now
    3176                 :          * that some tuple locks do not conflict with one of the lock modes we
    3177                 :          * use, it is possible that this case is interesting to handle
    3178                 :          * specially.
    3179                 :          *
    3180                 :          * This may cause failures with third-party code that calls
    3181                 :          * heap_update directly.
    3182                 :          */
    3183                 : 
    3184                 :         /* must copy state data before unlocking buffer */
    3185 CBC       35847 :         xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
    3186           35847 :         infomask = oldtup.t_data->t_infomask;
    3187 ECB             : 
    3188                 :         /*
    3189                 :          * Now we have to do something about the existing locker.  If it's a
    3190                 :          * multi, sleep on it; we might be awakened before it is completely
    3191                 :          * gone (or even not sleep at all in some cases); we need to preserve
    3192                 :          * it as locker, unless it is gone completely.
    3193                 :          *
    3194                 :          * If it's not a multi, we need to check for sleeping conditions
    3195                 :          * before actually going to sleep.  If the update doesn't conflict
    3196                 :          * with the locks, we just continue without sleeping (but making sure
    3197                 :          * it is preserved).
     3198                 :          * the locker is preserved).
    3199                 :          * Before sleeping, we need to acquire tuple lock to establish our
    3200                 :          * priority for the tuple (see heap_lock_tuple).  LockTuple will
    3201 EUB             :          * release us when we are next-in-line for the tuple.  Note we must
    3202                 :          * not acquire the tuple lock until we're sure we're going to sleep;
    3203                 :          * otherwise we're open for race conditions with other transactions
    3204                 :          * holding the tuple lock which sleep on us.
    3205                 :          *
    3206                 :          * If we are forced to "start over" below, we keep the tuple lock;
    3207                 :          * this arranges that we stay at the head of the line while rechecking
    3208 ECB             :          * tuple state.
    3209                 :          */
    3210 CBC       35847 :         if (infomask & HEAP_XMAX_IS_MULTI)
    3211                 :         {
    3212                 :             TransactionId update_xact;
    3213                 :             int         remain;
    3214              60 :             bool        current_is_member = false;
    3215 ECB             : 
    3216 GIC          60 :             if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
    3217 ECB             :                                         *lockmode, &current_is_member))
    3218                 :             {
    3219 CBC           8 :                 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    3220 ECB             : 
    3221                 :                 /*
    3222                 :                  * Acquire the lock, if necessary (but skip it when we're
    3223                 :                  * requesting a lock and already have one; avoids deadlock).
    3224                 :                  */
    3225 CBC           8 :                 if (!current_is_member)
    3226 LBC           0 :                     heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
    3227 EUB             :                                          LockWaitBlock, &have_tuple_lock);
    3228 ECB             : 
    3229                 :                 /* wait for multixact */
    3230 CBC           8 :                 MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
    3231 ECB             :                                 relation, &oldtup.t_self, XLTW_Update,
    3232                 :                                 &remain);
    3233 CBC           8 :                 checked_lockers = true;
    3234               8 :                 locker_remains = remain != 0;
    3235               8 :                 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    3236 ECB             : 
    3237                 :                 /*
    3238                 :                  * If xwait had just locked the tuple then some other xact
    3239                 :                  * could update this tuple before we get to this point.  Check
    3240                 :                  * for xmax change, and start over if so.
    3241                 :                  */
    3242 GIC           8 :                 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
    3243               8 :                                           infomask) ||
    3244               8 :                     !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
    3245                 :                                          xwait))
    3246 UIC           0 :                     goto l2;
    3247                 :             }
    3248 ECB             : 
    3249                 :             /*
    3250 EUB             :              * Note that the multixact may not be done by now.  It could have
     3251                 :              * surviving members: our own xact or other subxacts of this
    3252                 :              * backend, and also any other concurrent transaction that locked
    3253                 :              * the tuple with LockTupleKeyShare if we only got
    3254                 :              * LockTupleNoKeyExclusive.  If this is the case, we have to be
    3255                 :              * careful to mark the updated tuple with the surviving members in
    3256                 :              * Xmax.
    3257                 :              *
    3258                 :              * Note that there could have been another update in the
    3259                 :              * MultiXact. In that case, we need to check whether it committed
    3260                 :              * or aborted. If it aborted we are safe to update it again;
    3261                 :              * otherwise there is an update conflict, and we have to return
    3262 ECB             :              * TableTuple{Deleted, Updated} below.
    3263                 :              *
    3264                 :              * In the LockTupleExclusive case, we still need to preserve the
    3265                 :              * surviving members: those would include the tuple locks we had
    3266                 :              * before this one, which are important to keep in case this
    3267                 :              * subxact aborts.
    3268                 :              */
    3269 GIC          60 :             if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
    3270               8 :                 update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
    3271                 :             else
    3272              52 :                 update_xact = InvalidTransactionId;
    3273                 : 
    3274                 :             /*
     3275                 :              * There was no UPDATE in the MultiXact, or it aborted.  No
    3276 ECB             :              * TransactionIdIsInProgress() call needed here, since we called
    3277                 :              * MultiXactIdWait() above.
    3278                 :              */
    3279 CBC          68 :             if (!TransactionIdIsValid(update_xact) ||
    3280 GIC           8 :                 TransactionIdDidAbort(update_xact))
    3281 CBC          53 :                 can_continue = true;
    3282                 :         }
    3283           35787 :         else if (TransactionIdIsCurrentTransactionId(xwait))
    3284                 :         {
    3285 ECB             :             /*
    3286                 :              * The only locker is ourselves; we can avoid grabbing the tuple
    3287                 :              * lock here, but must preserve our locking information.
    3288                 :              */
    3289 GIC       35706 :             checked_lockers = true;
    3290           35706 :             locker_remains = true;
    3291           35706 :             can_continue = true;
    3292                 :         }
    3293              81 :         else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
    3294                 :         {
    3295                 :             /*
    3296 ECB             :              * If it's just a key-share locker, and we're not changing the key
    3297                 :              * columns, we don't need to wait for it to end; but we need to
    3298                 :              * preserve it as locker.
    3299                 :              */
    3300 GIC          29 :             checked_lockers = true;
    3301              29 :             locker_remains = true;
    3302              29 :             can_continue = true;
    3303 ECB             :         }
    3304                 :         else
    3305                 :         {
    3306                 :             /*
    3307                 :              * Wait for regular transaction to end; but first, acquire tuple
    3308                 :              * lock.
    3309                 :              */
    3310 GIC          52 :             LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    3311              52 :             heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
    3312 ECB             :                                  LockWaitBlock, &have_tuple_lock);
    3313 CBC          52 :             XactLockTableWait(xwait, relation, &oldtup.t_self,
    3314 ECB             :                               XLTW_Update);
    3315 CBC          52 :             checked_lockers = true;
    3316              52 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    3317 ECB             : 
    3318                 :             /*
    3319                 :              * xwait is done, but if xwait had just locked the tuple then some
    3320                 :              * other xact could update this tuple before we get to this point.
    3321                 :              * Check for xmax change, and start over if so.
    3322                 :              */
    3323 GIC          52 :             if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
    3324 CBC          51 :                 !TransactionIdEquals(xwait,
    3325                 :                                      HeapTupleHeaderGetRawXmax(oldtup.t_data)))
    3326 GIC           1 :                 goto l2;
    3327                 : 
    3328                 :             /* Otherwise check if it committed or aborted */
    3329              51 :             UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
    3330              51 :             if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
    3331              12 :                 can_continue = true;
    3332                 :         }
    3333                 : 
    3334           35846 :         if (can_continue)
    3335           35800 :             result = TM_Ok;
    3336              46 :         else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
    3337 CBC          41 :             result = TM_Updated;
    3338 EUB             :         else
    3339 GIC           5 :             result = TM_Deleted;
    3340                 :     }
    3341 EUB             : 
    3342 GBC      418994 :     if (crosscheck != InvalidSnapshot && result == TM_Ok)
    3343 EUB             :     {
    3344                 :         /* Perform additional check for transaction-snapshot mode RI updates */
    3345 UIC           0 :         if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
    3346 ECB             :         {
    3347 LBC           0 :             result = TM_Updated;
    3348               0 :             Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
    3349                 :         }
    3350 ECB             :     }
    3351                 : 
    3352 CBC      418994 :     if (result != TM_Ok)
    3353                 :     {
    3354             114 :         Assert(result == TM_SelfModified ||
    3355 ECB             :                result == TM_Updated ||
    3356                 :                result == TM_Deleted ||
    3357                 :                result == TM_BeingModified);
    3358 GIC         114 :         Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
    3359 CBC         114 :         Assert(result != TM_Updated ||
    3360                 :                !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
    3361 GIC         114 :         tmfd->ctid = oldtup.t_data->t_ctid;
    3362             114 :         tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
    3363             114 :         if (result == TM_SelfModified)
    3364              45 :             tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
    3365                 :         else
    3366              69 :             tmfd->cmax = InvalidCommandId;
    3367             114 :         UnlockReleaseBuffer(buffer);
    3368             114 :         if (have_tuple_lock)
    3369              39 :             UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
    3370             114 :         if (vmbuffer != InvalidBuffer)
    3371 UIC           0 :             ReleaseBuffer(vmbuffer);
    3372 GNC         114 :         *update_indexes = TU_None;
    3373                 : 
    3374 GIC         114 :         bms_free(hot_attrs);
    3375 GNC         114 :         bms_free(sum_attrs);
    3376 GIC         114 :         bms_free(key_attrs);
    3377             114 :         bms_free(id_attrs);
    3378             114 :         bms_free(modified_attrs);
    3379             114 :         bms_free(interesting_attrs);
    3380             114 :         return result;
    3381                 :     }
    3382 ECB             : 
    3383                 :     /*
    3384                 :      * If we didn't pin the visibility map page and the page has become all
    3385                 :      * visible while we were busy locking the buffer, or during some
    3386                 :      * subsequent window during which we had it unlocked, we'll have to unlock
    3387                 :      * and re-lock, to avoid holding the buffer lock across an I/O.  That's a
    3388                 :      * bit unfortunate, especially since we'll now have to recheck whether the
    3389                 :      * tuple has been locked or updated under us, but hopefully it won't
    3390                 :      * happen very often.
    3391                 :      */
    3392 GIC      418880 :     if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
    3393                 :     {
    3394 LBC           0 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    3395               0 :         visibilitymap_pin(relation, block, &vmbuffer);
    3396               0 :         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    3397 UIC           0 :         goto l2;
    3398 ECB             :     }
    3399                 : 
    3400                 :     /* Fill in transaction status data */
    3401                 : 
    3402                 :     /*
    3403                 :      * If the tuple we're updating is locked, we need to preserve the locking
    3404                 :      * info in the old tuple's Xmax.  Prepare a new Xmax value for this.
    3405                 :      */
    3406 GIC      418880 :     compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
    3407          418880 :                               oldtup.t_data->t_infomask,
    3408          418880 :                               oldtup.t_data->t_infomask2,
    3409                 :                               xid, *lockmode, true,
    3410                 :                               &xmax_old_tuple, &infomask_old_tuple,
    3411                 :                               &infomask2_old_tuple);
    3412                 : 
    3413 ECB             :     /*
    3414                 :      * And also prepare an Xmax value for the new copy of the tuple.  If there
    3415                 :      * was no xmax previously, or there was one but all lockers are now gone,
    3416                 :      * then use InvalidTransactionId; otherwise, get the xmax from the old
    3417                 :      * tuple.  (In rare cases that might also be InvalidTransactionId and yet
    3418                 :      * not have the HEAP_XMAX_INVALID bit set; that's fine.)
    3419                 :      */
    3420 CBC      418880 :     if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
    3421 GIC       35788 :         HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
    3422           35736 :         (checked_lockers && !locker_remains))
    3423          383092 :         xmax_new_tuple = InvalidTransactionId;
    3424                 :     else
    3425 CBC       35788 :         xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
    3426 ECB             : 
    3427 GIC      418880 :     if (!TransactionIdIsValid(xmax_new_tuple))
    3428 ECB             :     {
    3429 CBC      383092 :         infomask_new_tuple = HEAP_XMAX_INVALID;
    3430          383092 :         infomask2_new_tuple = 0;
    3431 ECB             :     }
    3432                 :     else
    3433                 :     {
    3434                 :         /*
    3435                 :          * If we found a valid Xmax for the new tuple, then the infomask bits
    3436                 :          * to use on the new tuple depend on what was there on the old one.
    3437                 :          * Note that since we're doing an update, the only possibility is that
    3438                 :          * the lockers had FOR KEY SHARE lock.
    3439                 :          */
    3440 GIC       35788 :         if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
    3441 ECB             :         {
    3442 GIC          53 :             GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
    3443                 :                                    &infomask2_new_tuple);
    3444                 :         }
    3445                 :         else
    3446                 :         {
    3447           35735 :             infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
    3448           35735 :             infomask2_new_tuple = 0;
    3449                 :         }
    3450 ECB             :     }
    3451                 : 
    3452                 :     /*
    3453                 :      * Prepare the new tuple with the appropriate initial values of Xmin and
    3454                 :      * Xmax, as well as initial infomask bits as computed above.
    3455                 :      */
    3456 GIC      418880 :     newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
    3457 CBC      418880 :     newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
    3458 GIC      418880 :     HeapTupleHeaderSetXmin(newtup->t_data, xid);
    3459          418880 :     HeapTupleHeaderSetCmin(newtup->t_data, cid);
    3460          418880 :     newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
    3461          418880 :     newtup->t_data->t_infomask2 |= infomask2_new_tuple;
    3462          418880 :     HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
    3463                 : 
    3464                 :     /*
    3465                 :      * Replace cid with a combo CID if necessary.  Note that we already put
    3466                 :      * the plain cid into the new tuple.
    3467                 :      */
    3468          418880 :     HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
    3469                 : 
    3470                 :     /*
    3471                 :      * If the toaster needs to be activated, OR if the new tuple will not fit
    3472                 :      * on the same page as the old, then we need to release the content lock
    3473                 :      * (but not the pin!) on the old tuple's buffer while we are off doing
    3474                 :      * TOAST and/or table-file-extension work.  We must mark the old tuple to
    3475                 :      * show that it's locked, else other processes may try to update it
    3476                 :      * themselves.
    3477                 :      *
    3478                 :      * We need to invoke the toaster if there are already any out-of-line
    3479                 :      * toasted values present, or if the new tuple is over-threshold.
    3480                 :      */
    3481          418880 :     if (relation->rd_rel->relkind != RELKIND_RELATION &&
    3482 UIC           0 :         relation->rd_rel->relkind != RELKIND_MATVIEW)
    3483                 :     {
    3484 ECB             :         /* toast table entries should never be recursively toasted */
    3485 UIC           0 :         Assert(!HeapTupleHasExternal(&oldtup));
    3486               0 :         Assert(!HeapTupleHasExternal(newtup));
    3487 LBC           0 :         need_toast = false;
    3488                 :     }
    3489                 :     else
    3490 GIC      418880 :         need_toast = (HeapTupleHasExternal(&oldtup) ||
    3491          837523 :                       HeapTupleHasExternal(newtup) ||
    3492 CBC      418643 :                       newtup->t_len > TOAST_TUPLE_THRESHOLD);
    3493                 : 
    3494 GIC      418880 :     pagefree = PageGetHeapFreeSpace(page);
    3495 ECB             : 
    3496 GBC      418880 :     newtupsize = MAXALIGN(newtup->t_len);
    3497                 : 
    3498 CBC      418880 :     if (need_toast || newtupsize > pagefree)
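                          : /*
                          :  * MAXALIGN rounds up to the platform's maximum alignment requirement
                          :  * (e.g., with 8-byte alignment, MAXALIGN(61) == 64).  Tuples are laid
                          :  * out on MAXALIGN boundaries, so it is this padded size, not the raw
                          :  * t_len, that must fit in pagefree.
                          :  */
                          : 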
    3499 GIC      196455 :     {
    3500 ECB             :         TransactionId xmax_lock_old_tuple;
    3501                 :         uint16      infomask_lock_old_tuple,
    3502                 :                     infomask2_lock_old_tuple;
    3503 GIC      196455 :         bool        cleared_all_frozen = false;
    3504                 : 
    3505                 :         /*
    3506                 :          * To prevent concurrent sessions from updating the tuple, we have to
    3507                 :          * temporarily mark it locked, while we release the page-level lock.
    3508                 :          *
     3509                 :          * To satisfy the rule that any xid potentially appearing in a buffer
     3510                 :          * written out to disk must be WAL-logged first, we unfortunately have to
     3511                 :          * WAL log this temporary modification.  We can reuse xl_heap_lock for this
    3512                 :          * purpose.  If we crash/error before following through with the
    3513                 :          * actual update, xmax will be of an aborted transaction, allowing
    3514                 :          * other sessions to proceed.
    3515                 :          */
    3516                 : 
    3517                 :         /*
    3518                 :          * Compute xmax / infomask appropriate for locking the tuple. This has
    3519                 :          * to be done separately from the combo that's going to be used for
    3520                 :          * updating, because the potentially created multixact would otherwise
    3521                 :          * be wrong.
    3522                 :          */
    3523 CBC      196455 :         compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
    3524 GIC      196455 :                                   oldtup.t_data->t_infomask,
    3525          196455 :                                   oldtup.t_data->t_infomask2,
    3526                 :                                   xid, *lockmode, false,
    3527                 :                                   &xmax_lock_old_tuple, &infomask_lock_old_tuple,
    3528                 :                                   &infomask2_lock_old_tuple);
    3529                 : 
    3530          196455 :         Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
    3531                 : 
    3532          196455 :         START_CRIT_SECTION();
    3533                 : 
    3534                 :         /* Clear obsolete visibility flags ... */
    3535          196455 :         oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
    3536          196455 :         oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    3537          196455 :         HeapTupleClearHotUpdated(&oldtup);
    3538                 :         /* ... and store info about transaction updating this tuple */
    3539          196455 :         Assert(TransactionIdIsValid(xmax_lock_old_tuple));
    3540 CBC      196455 :         HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
    3541 GIC      196455 :         oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
    3542          196455 :         oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
    3543          196455 :         HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
    3544                 : 
    3545                 :         /* temporarily make it look not-updated, but locked */
    3546          196455 :         oldtup.t_data->t_ctid = oldtup.t_self;
    3547                 : 
    3548                 :         /*
    3549 ECB             :          * Clear all-frozen bit on visibility map if needed. We could
    3550                 :          * immediately reset ALL_VISIBLE, but given that the WAL logging
     3551                 :          * overhead would be unchanged, that doesn't seem
     3552                 :          * worthwhile.
    3553                 :          */
    3554 GIC      196959 :         if (PageIsAllVisible(page) &&
    3555             504 :             visibilitymap_clear(relation, block, vmbuffer,
    3556 ECB             :                                 VISIBILITYMAP_ALL_FROZEN))
    3557 GIC         351 :             cleared_all_frozen = true;
    3558 ECB             : 
    3559 GIC      196455 :         MarkBufferDirty(buffer);
    3560                 : 
    3561          196455 :         if (RelationNeedsWAL(relation))
    3562                 :         {
    3563                 :             xl_heap_lock xlrec;
    3564                 :             XLogRecPtr  recptr;
    3565                 : 
    3566          186329 :             XLogBeginInsert();
    3567 CBC      186329 :             XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
    3568 ECB             : 
    3569 GIC      186329 :             xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
    3570          186329 :             xlrec.locking_xid = xmax_lock_old_tuple;
    3571          372658 :             xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
    3572          186329 :                                                   oldtup.t_data->t_infomask2);
    3573          186329 :             xlrec.flags =
    3574 CBC      186329 :                 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
    3575 GIC      186329 :             XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
    3576          186329 :             recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
    3577          186329 :             PageSetLSN(page, recptr);
    3578                 :         }
    3579                 : 
    3580          196455 :         END_CRIT_SECTION();
    3581                 : 
    3582          196455 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
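
    Aside (editor's illustration): the block above follows the backend's
    standard protocol for a WAL-logged page change: enter a critical section,
    mutate the page, MarkBufferDirty(), describe the change via
    XLogBeginInsert()/XLogRegisterBuffer()/XLogRegisterData()/XLogInsert(),
    and stamp the returned LSN on the page before leaving the critical
    section.  A hedged skeleton of that ordering only (the record payload,
    resource-manager id and info flag below are placeholders, not real
    heapam values):

        START_CRIT_SECTION();

        /* 1. modify the page while holding the buffer's content lock */
        /* ... page mutation goes here ... */

        MarkBufferDirty(buffer);

        if (RelationNeedsWAL(relation))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
            XLogRegisterData((char *) &payload, sizeof(payload)); /* placeholder */
            recptr = XLogInsert(rmgr_id, info);                   /* placeholders */

            /* 2. the page LSN must cover the record describing it */
            PageSetLSN(page, recptr);
        }

        END_CRIT_SECTION();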
    3583                 : 
    3584 ECB             :         /*
    3585                 :          * Let the toaster do its thing, if needed.
    3586                 :          *
    3587                 :          * Note: below this point, heaptup is the data we actually intend to
    3588                 :          * store into the relation; newtup is the caller's original untoasted
    3589                 :          * data.
    3590                 :          */
    3591 GIC      196455 :         if (need_toast)
    3592                 :         {
    3593                 :             /* Note we always use WAL and FSM during updates */
    3594            3085 :             heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
    3595            3085 :             newtupsize = MAXALIGN(heaptup->t_len);
    3596                 :         }
    3597                 :         else
    3598          193370 :             heaptup = newtup;
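
    Aside (editor's illustration): MAXALIGN() rounds the tuple length up to
    the platform's maximum alignment requirement, which is the space the
    tuple will really occupy on the page.  A standalone sketch of the
    rounding, assuming 8-byte alignment for illustration:

        #include <stdint.h>
        #include <stdio.h>

        #define MY_ALIGNOF 8            /* assumption; platform-dependent */
        #define MY_MAXALIGN(len) \
            (((uintptr_t) (len) + (MY_ALIGNOF - 1)) & ~((uintptr_t) (MY_ALIGNOF - 1)))

        int
        main(void)
        {
            printf("%lu\n", (unsigned long) MY_MAXALIGN(61));   /* prints 64 */
            return 0;
        }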
    3599                 : 
    3600                 :         /*
    3601                 :          * Now, do we need a new page for the tuple, or not?  This is a bit
    3602                 :          * tricky since someone else could have added tuples to the page while
    3603                 :          * we weren't looking.  We have to recheck the available space after
    3604 ECB             :          * reacquiring the buffer lock.  But don't bother to do that if the
    3605                 :          * former amount of free space is still not enough; it's unlikely
    3606                 :          * there's more free now than before.
    3607                 :          *
    3608                 :          * What's more, if we need to get a new page, we will need to acquire
    3609                 :          * buffer locks on both old and new pages.  To avoid deadlock against
    3610                 :          * some other backend trying to get the same two locks in the other
    3611                 :          * order, we must be consistent about the order we get the locks in.
    3612                 :          * We use the rule "lock the lower-numbered page of the relation
    3613                 :          * first".  To implement this, we must do RelationGetBufferForTuple
    3614                 :          * while not holding the lock on the old page, and we must rely on it
    3615                 :          * to get the locks on both pages in the correct order.
    3616                 :          *
    3617                 :          * Another consideration is that we need visibility map page pin(s) if
    3618                 :          * we will have to clear the all-visible flag on either page.  If we
    3619                 :          * call RelationGetBufferForTuple, we rely on it to acquire any such
    3620                 :          * pins; but if we don't, we have to handle that here.  Hence we need
    3621                 :          * a loop.
    3622                 :          */
    3623                 :         for (;;)
    3624                 :         {
    3625 GIC      196456 :             if (newtupsize > pagefree)
    3626                 :             {
    3627 ECB             :                 /* It doesn't fit, must use RelationGetBufferForTuple. */
    3628 CBC      196178 :                 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
    3629                 :                                                    buffer, 0, NULL,
    3630                 :                                                    &vmbuffer_new, &vmbuffer,
    3631                 :                                                    0);
    3632 ECB             :                 /* We're all done. */
    3633 CBC      196178 :                 break;
    3634 ECB             :             }
    3635                 :             /* Acquire VM page pin if needed and we don't have it. */
    3636 GIC         278 :             if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
    3637 UIC           0 :                 visibilitymap_pin(relation, block, &vmbuffer);
    3638 ECB             :             /* Re-acquire the lock on the old tuple's page. */
    3639 GIC         278 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    3640                 :             /* Re-check using the up-to-date free space */
    3641 CBC         278 :             pagefree = PageGetHeapFreeSpace(page);
    3642 GIC         278 :             if (newtupsize > pagefree ||
    3643 CBC         277 :                 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
    3644 ECB             :             {
    3645                 :                 /*
    3646                 :                  * Rats, it doesn't fit anymore, or somebody just now set the
    3647                 :                  * all-visible flag.  We must now unlock and loop to avoid
    3648                 :                  * deadlock.  Fortunately, this path should seldom be taken.
    3649                 :                  */
    3650 CBC           1 :                 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    3651 ECB             :             }
    3652                 :             else
    3653                 :             {
    3654                 :                 /* We're all done. */
    3655 GIC         277 :                 newbuf = buffer;
    3656 CBC         277 :                 break;
    3657 ECB             :             }
    3658                 :         }
    3659                 :     }
    3660                 :     else
    3661                 :     {
    3662                 :         /* No TOAST work needed, and it'll fit on same page */
    3663 GIC      222425 :         newbuf = buffer;
    3664          222425 :         heaptup = newtup;
    3665                 :     }
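
    Aside (editor's illustration): the "lock the lower-numbered page first"
    rule discussed above is the classic resource-ordering cure for deadlock:
    if every backend takes the two page locks in one global order, no cycle
    of waiters can form.  The same idea in a standalone pthreads sketch:

        #include <pthread.h>

        /* lock two resources in canonical (lower index first) order */
        static void
        lock_pair(pthread_mutex_t *locks, int a, int b)
        {
            int     lo = (a < b) ? a : b;
            int     hi = (a < b) ? b : a;

            pthread_mutex_lock(&locks[lo]);
            if (hi != lo)
                pthread_mutex_lock(&locks[hi]);
        }

        static void
        unlock_pair(pthread_mutex_t *locks, int a, int b)
        {
            int     lo = (a < b) ? a : b;
            int     hi = (a < b) ? b : a;

            if (hi != lo)
                pthread_mutex_unlock(&locks[hi]);
            pthread_mutex_unlock(&locks[lo]);
        }

        int
        main(void)
        {
            pthread_mutex_t locks[2] = {PTHREAD_MUTEX_INITIALIZER,
                                        PTHREAD_MUTEX_INITIALIZER};

            lock_pair(locks, 1, 0);     /* always acquires 0 before 1 */
            unlock_pair(locks, 1, 0);
            return 0;
        }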
    3666                 : 
    3667                 :     /*
    3668                 :      * We're about to do the actual update -- check for conflict first, to
    3669 ECB             :      * avoid possibly having to roll back work we've just done.
    3670                 :      *
    3671                 :      * This is safe without a recheck as long as there is no possibility of
    3672                 :      * another process scanning the pages between this check and the update
    3673                 :      * being visible to the scan (i.e., exclusive buffer content lock(s) are
    3674                 :      * continuously held from this point until the tuple update is visible).
    3675                 :      *
    3676                 :      * For the new tuple the only check needed is at the relation level, but
    3677                 :      * since both tuples are in the same relation and the check for oldtup
    3678                 :      * will include checking the relation level, there is no benefit to a
    3679                 :      * separate check for the new tuple.
    3680                 :      */
    3681 GIC      418880 :     CheckForSerializableConflictIn(relation, &oldtup.t_self,
    3682 ECB             :                                    BufferGetBlockNumber(buffer));
    3683                 : 
    3684                 :     /*
    3685                 :      * At this point newbuf and buffer are both pinned and locked, and newbuf
    3686                 :      * has enough space for the new tuple.  If they are the same buffer, only
    3687                 :      * one pin is held.
    3688                 :      */
    3689                 : 
    3690 CBC      418868 :     if (newbuf == buffer)
    3691 ECB             :     {
    3692                 :         /*
    3693                 :          * Since the new tuple is going into the same page, we might be able
    3694                 :          * to do a HOT update.  Check if any of the index columns have been
    3695                 :          * changed.
    3696                 :          */
    3697 GIC      222690 :         if (!bms_overlap(modified_attrs, hot_attrs))
    3698                 :         {
    3699          213765 :             use_hot_update = true;
    3700                 : 
    3701                 :             /*
    3702                 :              * If none of the columns used in hot-blocking indexes were
    3703                 :              * updated, we can apply HOT.  We must still check whether any
    3704                 :              * summarizing indexes need updating: if columns they cover were
    3705                 :              * modified, skipping them could fail to detect e.g. value bound
    3706                 :              * changes in BRIN minmax indexes.
    3707                 :              */
    3708 GNC      213765 :             if (bms_overlap(modified_attrs, sum_attrs))
    3709            1641 :                 summarized_update = true;
    3710                 :         }
    3711                 :     }
    3712                 :     else
    3713 ECB             :     {
    3714                 :         /* Set a hint that the old page could use prune/defrag */
    3715 GIC      196178 :         PageSetFull(page);
    3716 ECB             :     }
    3717                 : 
    3718                 :     /*
    3719                 :      * Compute replica identity tuple before entering the critical section so
    3720                 :      * we don't PANIC upon a memory allocation failure.
    3721                 :      * ExtractReplicaIdentity() will return NULL if nothing needs to be
    3722                 :      * logged.  Pass old key required as true only if the replica identity key
    3723                 :      * columns are modified or it has external data.
    3724                 :      */
    3725 GIC      418868 :     old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
    3726          418868 :                                            bms_overlap(modified_attrs, id_attrs) ||
    3727 ECB             :                                            id_has_external,
    3728                 :                                            &old_key_copied);
    3729                 : 
    3730                 :     /* NO EREPORT(ERROR) from here till changes are logged */
    3731 GIC      418868 :     START_CRIT_SECTION();
    3732                 : 
    3733                 :     /*
    3734                 :      * If this transaction commits, the old tuple will become DEAD sooner or
    3735                 :      * later.  Set flag that this page is a candidate for pruning once our xid
    3736 ECB             :      * falls below the OldestXmin horizon.  If the transaction finally aborts,
    3737                 :      * the subsequent page pruning will be a no-op and the hint will be
    3738                 :      * cleared.
    3739                 :      *
    3740                 :      * XXX Should we set hint on newbuf as well?  If the transaction aborts,
    3741                 :      * there would be a prunable tuple in the newbuf; but for now we choose
    3742                 :      * not to optimize for aborts.  Note that heap_xlog_update must be kept in
    3743                 :      * sync if this decision changes.
    3744                 :      */
    3745 GIC      418868 :     PageSetPrunable(page, xid);
    3746                 : 
    3747          418868 :     if (use_hot_update)
    3748 ECB             :     {
    3749                 :         /* Mark the old tuple as HOT-updated */
    3750 CBC      213765 :         HeapTupleSetHotUpdated(&oldtup);
    3751 ECB             :         /* And mark the new tuple as heap-only */
    3752 GIC      213765 :         HeapTupleSetHeapOnly(heaptup);
    3753 ECB             :         /* Mark the caller's copy too, in case different from heaptup */
    3754 GIC      213765 :         HeapTupleSetHeapOnly(newtup);
    3755                 :     }
    3756 ECB             :     else
    3757                 :     {
    3758                 :         /* Make sure tuples are correctly marked as not-HOT */
    3759 CBC      205103 :         HeapTupleClearHotUpdated(&oldtup);
    3760 GIC      205103 :         HeapTupleClearHeapOnly(heaptup);
    3761 CBC      205103 :         HeapTupleClearHeapOnly(newtup);
    3762 ECB             :     }
    3763                 : 
    3764 CBC      418868 :     RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
    3765 ECB             : 
    3766                 : 
    3767                 :     /* Clear obsolete visibility flags, possibly set by ourselves above... */
    3768 CBC      418868 :     oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
    3769 GIC      418868 :     oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    3770                 :     /* ... and store info about transaction updating this tuple */
    3771          418868 :     Assert(TransactionIdIsValid(xmax_old_tuple));
    3772          418868 :     HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
    3773          418868 :     oldtup.t_data->t_infomask |= infomask_old_tuple;
    3774          418868 :     oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
    3775          418868 :     HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
    3776 ECB             : 
    3777                 :     /* record address of new tuple in t_ctid of old one */
    3778 GIC      418868 :     oldtup.t_data->t_ctid = heaptup->t_self;
    3779                 : 
    3780                 :     /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
    3781          418868 :     if (PageIsAllVisible(BufferGetPage(buffer)))
    3782                 :     {
    3783             961 :         all_visible_cleared = true;
    3784             961 :         PageClearAllVisible(BufferGetPage(buffer));
    3785 CBC         961 :         visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
    3786 ECB             :                             vmbuffer, VISIBILITYMAP_VALID_BITS);
    3787                 :     }
    3788 GIC      418868 :     if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
    3789                 :     {
    3790             362 :         all_visible_cleared_new = true;
    3791 CBC         362 :         PageClearAllVisible(BufferGetPage(newbuf));
    3792             362 :         visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
    3793                 :                             vmbuffer_new, VISIBILITYMAP_VALID_BITS);
    3794                 :     }
    3795                 : 
    3796 GIC      418868 :     if (newbuf != buffer)
    3797          196178 :         MarkBufferDirty(newbuf);
    3798          418868 :     MarkBufferDirty(buffer);
    3799                 : 
    3800                 :     /* XLOG stuff */
    3801          418868 :     if (RelationNeedsWAL(relation))
    3802                 :     {
    3803 ECB             :         XLogRecPtr  recptr;
    3804                 : 
    3805                 :         /*
    3806 EUB             :          * For logical decoding we need combo CIDs to properly decode the
    3807                 :          * catalog.
    3808                 :          */
    3809 GIC      407557 :         if (RelationIsAccessibleInLogicalDecoding(relation))
    3810 ECB             :         {
    3811 CBC        1873 :             log_heap_new_cid(relation, &oldtup);
    3812            1873 :             log_heap_new_cid(relation, heaptup);
    3813                 :         }
    3814                 : 
    3815 GIC      407557 :         recptr = log_heap_update(relation, buffer,
    3816                 :                                  newbuf, &oldtup, heaptup,
    3817                 :                                  old_key_tuple,
    3818                 :                                  all_visible_cleared,
    3819                 :                                  all_visible_cleared_new);
    3820          407557 :         if (newbuf != buffer)
    3821                 :         {
    3822          186055 :             PageSetLSN(BufferGetPage(newbuf), recptr);
    3823                 :         }
    3824          407557 :         PageSetLSN(BufferGetPage(buffer), recptr);
    3825                 :     }
    3826                 : 
    3827 CBC      418868 :     END_CRIT_SECTION();
    3828                 : 
    3829 GIC      418868 :     if (newbuf != buffer)
    3830          196178 :         LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
    3831          418868 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    3832                 : 
    3833                 :     /*
    3834 ECB             :      * Mark old tuple for invalidation from system caches at next command
    3835                 :      * boundary, and mark the new tuple for invalidation in case we abort. We
    3836                 :      * have to do this before releasing the buffer because oldtup is in the
    3837                 :      * buffer.  (heaptup is all in local memory, but it's necessary to process
    3838                 :      * both tuple versions in one call to inval.c so we can avoid redundant
    3839                 :      * sinval messages.)
    3840                 :      */
    3841 CBC      418868 :     CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
    3842                 : 
    3843                 :     /* Now we can release the buffer(s) */
    3844 GIC      418868 :     if (newbuf != buffer)
    3845          196178 :         ReleaseBuffer(newbuf);
    3846          418868 :     ReleaseBuffer(buffer);
    3847          418868 :     if (BufferIsValid(vmbuffer_new))
    3848             363 :         ReleaseBuffer(vmbuffer_new);
    3849          418868 :     if (BufferIsValid(vmbuffer))
    3850             961 :         ReleaseBuffer(vmbuffer);
    3851                 : 
    3852 ECB             :     /*
    3853                 :      * Release the lmgr tuple lock, if we had it.
    3854 EUB             :      */
    3855 CBC      418868 :     if (have_tuple_lock)
    3856 GIC          12 :         UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
    3857                 : 
    3858 GNC      418868 :     pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
    3859                 : 
    3860                 :     /*
    3861                 :      * If heaptup is a private copy, release it.  Don't forget to copy t_self
    3862                 :      * back to the caller's image, too.
    3863 ECB             :      */
    3864 GIC      418868 :     if (heaptup != newtup)
    3865 EUB             :     {
    3866 GIC        3038 :         newtup->t_self = heaptup->t_self;
    3867 GBC        3038 :         heap_freetuple(heaptup);
    3868 EUB             :     }
    3869                 : 
    3870                 :     /*
    3871                 :      * Even a HOT update may still need to update summarized indexes;
    3872                 :      * otherwise their summaries become stale and queries can return
    3873                 :      * incorrect results (for example, this update may change the minmax
    3874                 :      * bounds of the block).
    3875                 :      */
    3876 GNC      418868 :     if (use_hot_update)
    3877                 :     {
    3878          213765 :         if (summarized_update)
    3879            1641 :             *update_indexes = TU_Summarizing;
    3880                 :         else
    3881          212124 :             *update_indexes = TU_None;
    3882                 :     }
    3883                 :     else
    3884          205103 :         *update_indexes = TU_All;
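
    Aside (editor's illustration): the TU_UpdateIndexes result tells the
    caller how much index maintenance remains.  A hedged sketch of a
    caller-side dispatch (insert_index_entries_for is hypothetical, not a
    PostgreSQL function):

        switch (update_indexes)
        {
            case TU_None:           /* pure HOT update: no index entries */
                break;
            case TU_Summarizing:    /* HOT, but summarizing (e.g. BRIN)
                                     * indexes still need the new tuple */
                insert_index_entries_for(newtup, true /* only summarizing */);
                break;
            case TU_All:            /* non-HOT: update every index */
                insert_index_entries_for(newtup, false);
                break;
        }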
    3885                 : 
    3886 GIC      418868 :     if (old_key_tuple != NULL && old_key_copied)
    3887             141 :         heap_freetuple(old_key_tuple);
    3888                 : 
    3889          418868 :     bms_free(hot_attrs);
    3890 GNC      418868 :     bms_free(sum_attrs);
    3891 GIC      418868 :     bms_free(key_attrs);
    3892          418868 :     bms_free(id_attrs);
    3893          418868 :     bms_free(modified_attrs);
    3894          418868 :     bms_free(interesting_attrs);
    3895 ECB             : 
    3896 CBC      418868 :     return TM_Ok;
    3897                 : }
    3898 ECB             : 
    3899                 : /*
    3900                 :  * Check if the specified attribute's values are the same.  Subroutine for
    3901                 :  * HeapDetermineColumnsInfo.
    3902                 :  */
    3903                 : static bool
    3904 GIC     1234805 : heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
    3905                 :                  bool isnull1, bool isnull2)
    3906                 : {
    3907                 :     Form_pg_attribute att;
    3908                 : 
    3909 ECB             :     /*
    3910                 :      * If one value is NULL and the other is not, they are certainly not
    3911                 :      * equal.
    3912                 :      */
    3913 GIC     1234805 :     if (isnull1 != isnull2)
    3914               3 :         return false;
    3915                 : 
    3916                 :     /*
    3917 ECB             :      * If both are NULL, they can be considered equal.
    3918                 :      */
    3919 CBC     1234802 :     if (isnull1)
    3920 GIC        4991 :         return true;
    3921                 : 
    3922 ECB             :     /*
    3923                 :      * We do simple binary comparison of the two datums.  This may be overly
    3924                 :      * strict because there can be multiple binary representations for the
    3925                 :      * same logical value.  But we should be OK as long as there are no false
    3926                 :      * positives.  Using a type-specific equality operator is messy because
    3927                 :      * there could be multiple notions of equality in different operator
    3928                 :      * classes; furthermore, we cannot safely invoke user-defined functions
    3929                 :      * while holding exclusive buffer lock.
    3930                 :      */
    3931 GIC     1229811 :     if (attrnum <= 0)
    3932                 :     {
    3933                 :         /* The only allowed system columns are OIDs, so do this */
    3934 LBC           0 :         return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
    3935                 :     }
    3936                 :     else
    3937                 :     {
    3938 GIC     1229811 :         Assert(attrnum <= tupdesc->natts);
    3939         1229811 :         att = TupleDescAttr(tupdesc, attrnum - 1);
    3940         1229811 :         return datumIsEqual(value1, value2, att->attbyval, att->attlen);
    3941 ECB             :     }
    3942                 : }
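
    Aside (editor's illustration): datumIsEqual() performs a raw binary
    comparison -- Datum-to-Datum for by-value types, byte-for-byte for
    by-reference types -- so, as the comment above notes, it can yield a
    false "not equal" (two encodings of one logical value) but never a false
    "equal", which is the safe direction here.  A simplified standalone
    analogue for a fixed-length by-reference type:

        #include <stdbool.h>
        #include <string.h>

        /* equality == identical byte images; false negatives are allowed */
        static bool
        binary_datum_equals(const void *value1, const void *value2,
                            size_t attlen)
        {
            return memcmp(value1, value2, attlen) == 0;
        }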
    3943                 : 
    3944                 : /*
    3945                 :  * Check which columns are being updated.
    3946                 :  *
    3947 EUB             :  * Given an updated tuple, determine (and return into the output bitmapset),
    3948                 :  * from those listed as interesting, the set of columns that changed.
    3949                 :  *
    3950                 :  * has_external indicates if any of the unmodified attributes (from those
    3951                 :  * listed as interesting) of the old tuple is a member of external_cols and is
    3952 ECB             :  * stored externally.
    3953 EUB             :  */
    3954                 : static Bitmapset *
    3955 GIC      418994 : HeapDetermineColumnsInfo(Relation relation,
    3956                 :                          Bitmapset *interesting_cols,
    3957 EUB             :                          Bitmapset *external_cols,
    3958                 :                          HeapTuple oldtup, HeapTuple newtup,
    3959                 :                          bool *has_external)
    3960                 : {
    3961                 :     int         attidx;
    3962 GBC      418994 :     Bitmapset  *modified = NULL;
    3963 GIC      418994 :     TupleDesc   tupdesc = RelationGetDescr(relation);
    3964                 : 
    3965 GNC      418994 :     attidx = -1;
    3966         1653799 :     while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
    3967                 :     {
    3968                 :         /* attidx is zero-based, attrnum is the normal attribute number */
    3969         1234805 :         AttrNumber  attrnum = attidx + FirstLowInvalidHeapAttributeNumber;
    3970                 :         Datum       value1,
    3971                 :                     value2;
    3972                 :         bool        isnull1,
    3973                 :                     isnull2;
    3974                 : 
    3975                 :         /*
    3976                 :          * If it's a whole-tuple reference, say "not equal".  It's not really
    3977 ECB             :          * worth supporting this case, since it could only succeed after a
    3978                 :          * no-op update, which is hardly a case worth optimizing for.
    3979                 :          */
    3980 CBC     1234805 :         if (attrnum == 0)
    3981                 :         {
    3982 UNC           0 :             modified = bms_add_member(modified, attidx);
    3983 GIC     1170399 :             continue;
    3984 ECB             :         }
    3985                 : 
    3986                 :         /*
    3987                 :          * Likewise, automatically say "not equal" for any system attribute
    3988                 :          * other than tableOID; we cannot expect these to be consistent in a
    3989                 :          * HOT chain, or even to be set correctly yet in the new tuple.
    3990                 :          */
    3991 GIC     1234805 :         if (attrnum < 0)
    3992                 :         {
    3993 UIC           0 :             if (attrnum != TableOidAttributeNumber)
    3994                 :             {
    3995 UNC           0 :                 modified = bms_add_member(modified, attidx);
    3996 UIC           0 :                 continue;
    3997                 :             }
    3998                 :         }
    3999                 : 
    4000                 :         /*
    4001                 :          * Extract the corresponding values.  XXX this is pretty inefficient
    4002                 :          * if there are many indexed columns.  Should we do a single
    4003                 :          * heap_deform_tuple call on each tuple, instead?   But that doesn't
    4004                 :          * work for system columns ...
    4005                 :          */
    4006 GIC     1234805 :         value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
    4007         1234805 :         value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
    4008                 : 
    4009         1234805 :         if (!heap_attr_equals(tupdesc, attrnum, value1,
    4010                 :                               value2, isnull1, isnull2))
    4011                 :         {
    4012 GNC       18483 :             modified = bms_add_member(modified, attidx);
    4013 GIC       18483 :             continue;
    4014                 :         }
    4015 ECB             : 
    4016                 :         /*
    4017                 :          * No need to check attributes that can't be stored externally. Note
    4018                 :          * that system attributes can't be stored externally.
    4019                 :          */
    4020 GIC     1216322 :         if (attrnum < 0 || isnull1 ||
    4021 CBC     1211331 :             TupleDescAttr(tupdesc, attrnum - 1)->attlen != -1)
    4022 GIC     1151916 :             continue;
    4023                 : 
    4024 ECB             :         /*
    4025                 :          * Check if the old tuple's attribute is stored externally and is a
    4026                 :          * member of external_cols.
    4027                 :          */
    4028 GIC       64411 :         if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) &&
    4029 GNC           5 :             bms_is_member(attidx, external_cols))
    4030 CBC           2 :             *has_external = true;
    4031 ECB             :     }
    4032                 : 
    4033 CBC      418994 :     return modified;
    4034                 : }
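
    Aside (editor's illustration): attribute numbers can be negative for
    system columns, while bitmapsets hold only non-negative members, so the
    code above stores attno - FirstLowInvalidHeapAttributeNumber and converts
    back while walking the set with bms_next_member().  The same
    offset-encoded walk, sketched with a plain 64-bit mask standing in for
    Bitmapset:

        #include <stdint.h>
        #include <stdio.h>

        #define OFFSET 8   /* stands in for -FirstLowInvalidHeapAttributeNumber */

        int
        main(void)
        {
            uint64_t set = 0;
            int      attnos[] = {-1, 2, 5};   /* one system, two user columns */

            for (int i = 0; i < 3; i++)       /* encode: shift into >= 0 */
                set |= UINT64_C(1) << (attnos[i] + OFFSET);

            for (int idx = 0; idx < 64; idx++) /* decode: recover attnos */
                if (set & (UINT64_C(1) << idx))
                    printf("attno %d\n", idx - OFFSET);
            return 0;
        }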
    4035 ECB             : 
    4036                 : /*
    4037                 :  *  simple_heap_update - replace a tuple
    4038                 :  *
    4039                 :  * This routine may be used to update a tuple when concurrent updates of
    4040                 :  * the target tuple are not expected (for example, because we have a lock
    4041                 :  * on the relation associated with the tuple).  Any failure is reported
    4042                 :  * via ereport().
    4043                 :  */
    4044                 : void
    4045 GNC      201216 : simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup,
    4046                 :                    TU_UpdateIndexes *update_indexes)
    4047                 : {
    4048 ECB             :     TM_Result   result;
    4049                 :     TM_FailureData tmfd;
    4050                 :     LockTupleMode lockmode;
    4051                 : 
    4052 CBC      201216 :     result = heap_update(relation, otid, tup,
    4053                 :                          GetCurrentCommandId(true), InvalidSnapshot,
    4054 ECB             :                          true /* wait for commit */ ,
    4055                 :                          &tmfd, &lockmode, update_indexes);
    4056 CBC      201216 :     switch (result)
    4057                 :     {
    4058 LBC           0 :         case TM_SelfModified:
    4059 ECB             :             /* Tuple was already updated in current command? */
    4060 UIC           0 :             elog(ERROR, "tuple already updated by self");
    4061 ECB             :             break;
    4062                 : 
    4063 GIC      201216 :         case TM_Ok:
    4064                 :             /* done successfully */
    4065          201216 :             break;
    4066                 : 
    4067 UIC           0 :         case TM_Updated:
    4068               0 :             elog(ERROR, "tuple concurrently updated");
    4069 ECB             :             break;
    4070                 : 
    4071 UIC           0 :         case TM_Deleted:
    4072 LBC           0 :             elog(ERROR, "tuple concurrently deleted");
    4073 ECB             :             break;
    4074                 : 
    4075 UIC           0 :         default:
    4076               0 :             elog(ERROR, "unrecognized heap_update status: %u", result);
    4077                 :             break;
    4078                 :     }
    4079 GIC      201216 : }
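
    Aside (editor's illustration): simple_heap_update() is the wrapper to use
    when the caller's locking already rules out concurrent updates; anything
    other than TM_Ok becomes an error.  A hedged usage sketch (rel and oldtup
    assumed to come from an open relation and a fetched tuple; error handling
    omitted):

        HeapTuple           newtup = heap_copytuple(oldtup);
        TU_UpdateIndexes    update_indexes;

        /* ... modify newtup's user data in place here ... */

        simple_heap_update(rel, &oldtup->t_self, newtup, &update_indexes);
        /* index maintenance is then driven by update_indexes */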
    4080                 : 
    4081                 : 
    4082                 : /*
    4083 ECB             :  * Return the MultiXactStatus corresponding to the given tuple lock mode.
    4084                 :  */
    4085                 : static MultiXactStatus
    4086 CBC        1188 : get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
    4087                 : {
    4088 ECB             :     int         retval;
    4089                 : 
    4090 GIC        1188 :     if (is_update)
    4091              96 :         retval = tupleLockExtraInfo[mode].updstatus;
    4092                 :     else
    4093            1092 :         retval = tupleLockExtraInfo[mode].lockstatus;
    4094                 : 
    4095            1188 :     if (retval == -1)
    4096 UIC           0 :         elog(ERROR, "invalid lock tuple mode %d/%s", mode,
    4097                 :              is_update ? "true" : "false");
    4098                 : 
    4099 GIC        1188 :     return (MultiXactStatus) retval;
    4100                 : }
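
    Aside (editor's illustration): this is a lookup table with a -1 sentinel
    marking mode/is_update combinations that must never be requested; hitting
    the sentinel is a programming error, hence the bare elog(ERROR).  The
    pattern in standalone form (table contents hypothetical):

        #include <stdio.h>
        #include <stdlib.h>

        #define INVALID (-1)

        static const int status_table[2][2] = {
            {10, INVALID},      /* mode 0: lock flavor only */
            {11, 12},           /* mode 1: lock and update flavors */
        };

        static int
        status_for(int mode, int is_update)
        {
            int     retval = status_table[mode][is_update ? 1 : 0];

            if (retval == INVALID)
            {
                fprintf(stderr, "invalid mode %d/%d\n", mode, is_update);
                abort();        /* stands in for elog(ERROR, ...) */
            }
            return retval;
        }

        int
        main(void)
        {
            printf("%d\n", status_for(1, 1));   /* prints 12 */
            return 0;
        }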
    4101                 : 
    4102 ECB             : /*
    4103                 :  *  heap_lock_tuple - lock a tuple in shared or exclusive mode
    4104                 :  *
    4105                 :  * Note that this acquires a buffer pin, which the caller must release.
    4106                 :  *
    4107                 :  * Input parameters:
    4108                 :  *  relation: relation containing tuple (caller must hold suitable lock)
    4109                 :  *  tid: TID of tuple to lock
    4110                 :  *  cid: current command ID (used for visibility test, and stored into
    4111                 :  *      tuple's cmax if lock is successful)
    4112                 :  *  mode: indicates if shared or exclusive tuple lock is desired
    4113                 :  *  wait_policy: what to do if tuple lock is not available
    4114                 :  *  follow_updates: if true, follow the update chain to also lock descendant
    4115                 :  *      tuples.
    4116                 :  *
    4117                 :  * Output parameters:
    4118                 :  *  *tuple: all fields filled in
    4119                 :  *  *buffer: set to buffer holding tuple (pinned but not locked at exit)
    4120                 :  *  *tmfd: filled in failure cases (see below)
    4121                 :  *
    4122                 :  * Function results are the same as the ones for table_tuple_lock().
    4123                 :  *
    4124                 :  * In the failure cases other than TM_Invisible, the routine fills
    4125                 :  * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
    4126                 :  * if necessary), and t_cmax (the last only for TM_SelfModified,
    4127                 :  * since we cannot obtain cmax from a combo CID generated by another
    4128                 :  * transaction).
    4129                 :  * See comments for struct TM_FailureData for additional info.
    4130                 :  *
    4131                 :  * See README.tuplock for a thorough explanation of this mechanism.
    4132                 :  */
    4133                 : TM_Result
    4134 GIC       82482 : heap_lock_tuple(Relation relation, HeapTuple tuple,
    4135                 :                 CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
    4136                 :                 bool follow_updates,
    4137                 :                 Buffer *buffer, TM_FailureData *tmfd)
    4138                 : {
    4139                 :     TM_Result   result;
    4140           82482 :     ItemPointer tid = &(tuple->t_self);
    4141                 :     ItemId      lp;
    4142                 :     Page        page;
    4143           82482 :     Buffer      vmbuffer = InvalidBuffer;
    4144                 :     BlockNumber block;
    4145                 :     TransactionId xid,
    4146 ECB             :                 xmax;
    4147                 :     uint16      old_infomask,
    4148                 :                 new_infomask,
    4149                 :                 new_infomask2;
    4150 CBC       82482 :     bool        first_time = true;
    4151           82482 :     bool        skip_tuple_lock = false;
    4152 GIC       82482 :     bool        have_tuple_lock = false;
    4153 CBC       82482 :     bool        cleared_all_frozen = false;
    4154                 : 
    4155           82482 :     *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
    4156 GIC       82482 :     block = ItemPointerGetBlockNumber(tid);
    4157 ECB             : 
    4158                 :     /*
    4159                 :      * Before locking the buffer, pin the visibility map page if it appears to
    4160                 :      * be necessary.  Since we haven't got the lock yet, someone else might be
    4161                 :      * in the middle of changing this, so we'll need to recheck after we have
    4162                 :      * the lock.
    4163                 :      */
    4164 CBC       82482 :     if (PageIsAllVisible(BufferGetPage(*buffer)))
    4165            1651 :         visibilitymap_pin(relation, block, &vmbuffer);
    4166                 : 
    4167           82482 :     LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4168 ECB             : 
    4169 GIC       82482 :     page = BufferGetPage(*buffer);
    4170 CBC       82482 :     lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
    4171           82482 :     Assert(ItemIdIsNormal(lp));
    4172 ECB             : 
    4173 GIC       82482 :     tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
    4174 CBC       82482 :     tuple->t_len = ItemIdGetLength(lp);
    4175           82482 :     tuple->t_tableOid = RelationGetRelid(relation);
    4176                 : 
    4177              14 : l3:
    4178           82496 :     result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
    4179 ECB             : 
    4180 CBC       82496 :     if (result == TM_Invisible)
    4181                 :     {
    4182 ECB             :         /*
    4183                 :          * This is possible, but only when locking a tuple for ON CONFLICT
    4184                 :          * UPDATE.  We return this value here rather than throwing an error in
    4185                 :          * order to give that case the opportunity to throw a more specific
    4186                 :          * error.
    4187                 :          */
    4188 GIC          12 :         result = TM_Invisible;
    4189              12 :         goto out_locked;
    4190                 :     }
    4191           82484 :     else if (result == TM_BeingModified ||
    4192           75528 :              result == TM_Updated ||
    4193                 :              result == TM_Deleted)
    4194                 :     {
    4195 ECB             :         TransactionId xwait;
    4196                 :         uint16      infomask;
    4197                 :         uint16      infomask2;
    4198                 :         bool        require_sleep;
    4199                 :         ItemPointerData t_ctid;
    4200                 : 
    4201                 :         /* must copy state data before unlocking buffer */
    4202 GIC        6958 :         xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
    4203            6958 :         infomask = tuple->t_data->t_infomask;
    4204            6958 :         infomask2 = tuple->t_data->t_infomask2;
    4205            6958 :         ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
    4206                 : 
    4207            6958 :         LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
    4208                 : 
    4209                 :         /*
    4210                 :          * If any subtransaction of the current top transaction already holds
    4211                 :          * a lock as strong as or stronger than what we're requesting, we
    4212                 :          * effectively hold the desired lock already.  We *must* succeed
    4213                 :          * without trying to take the tuple lock, else we will deadlock
    4214                 :          * against anyone wanting to acquire a stronger lock.
    4215                 :          *
    4216                 :          * Note we only do this the first time we loop on the HTSU result;
    4217                 :          * there is no point in testing in subsequent passes, because
    4218                 :          * evidently our own transaction cannot have acquired a new lock after
    4219 ECB             :          * the first time we checked.
    4220                 :          */
    4221 GIC        6958 :         if (first_time)
    4222                 :         {
    4223 CBC        6949 :             first_time = false;
    4224                 : 
    4225 GIC        6949 :             if (infomask & HEAP_XMAX_IS_MULTI)
    4226                 :             {
    4227                 :                 int         i;
    4228                 :                 int         nmembers;
    4229 ECB             :                 MultiXactMember *members;
    4230                 : 
    4231                 :                 /*
    4232                 :                  * We don't need to allow old multixacts here; if that had
    4233                 :                  * been the case, HeapTupleSatisfiesUpdate would have returned
    4234                 :                  * MayBeUpdated and we wouldn't be here.
    4235                 :                  */
    4236                 :                 nmembers =
    4237 GIC          84 :                     GetMultiXactIdMembers(xwait, &members, false,
    4238 CBC          84 :                                           HEAP_XMAX_IS_LOCKED_ONLY(infomask));
    4239                 : 
    4240             251 :                 for (i = 0; i < nmembers; i++)
    4241 ECB             :                 {
    4242                 :                     /* only consider members of our own transaction */
    4243 GIC         181 :                     if (!TransactionIdIsCurrentTransactionId(members[i].xid))
    4244             132 :                         continue;
    4245 ECB             : 
    4246 GIC          49 :                     if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
    4247                 :                     {
    4248              14 :                         pfree(members);
    4249              14 :                         result = TM_Ok;
    4250              14 :                         goto out_unlocked;
    4251                 :                     }
    4252                 :                     else
    4253                 :                     {
    4254 ECB             :                         /*
    4255                 :                          * Disable acquisition of the heavyweight tuple lock.
    4256                 :                          * Otherwise, when promoting a weaker lock, we might
    4257                 :                          * deadlock with another locker that has acquired the
    4258                 :                          * heavyweight tuple lock and is waiting for our
    4259                 :                          * transaction to finish.
    4260                 :                          *
    4261                 :                          * Note that in this case we still need to wait for
    4262                 :                          * the multixact if required, to avoid acquiring
    4263                 :                          * conflicting locks.
    4264                 :                          */
    4265 GIC          35 :                         skip_tuple_lock = true;
    4266                 :                     }
    4267                 :                 }
    4268                 : 
    4269              70 :                 if (members)
    4270              70 :                     pfree(members);
    4271 ECB             :             }
    4272 GIC        6865 :             else if (TransactionIdIsCurrentTransactionId(xwait))
    4273                 :             {
    4274            5697 :                 switch (mode)
    4275                 :                 {
    4276             123 :                     case LockTupleKeyShare:
    4277 CBC         123 :                         Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
    4278 ECB             :                                HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
    4279                 :                                HEAP_XMAX_IS_EXCL_LOCKED(infomask));
    4280 CBC         123 :                         result = TM_Ok;
    4281 GIC         123 :                         goto out_unlocked;
    4282             116 :                     case LockTupleShare:
    4283             116 :                         if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
    4284               6 :                             HEAP_XMAX_IS_EXCL_LOCKED(infomask))
    4285                 :                         {
    4286 CBC         110 :                             result = TM_Ok;
    4287             110 :                             goto out_unlocked;
    4288 EUB             :                         }
    4289 CBC           6 :                         break;
    4290 GIC          55 :                     case LockTupleNoKeyExclusive:
    4291              55 :                         if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
    4292 ECB             :                         {
    4293 GIC          44 :                             result = TM_Ok;
    4294              44 :                             goto out_unlocked;
    4295                 :                         }
    4296              11 :                         break;
    4297            5403 :                     case LockTupleExclusive:
    4298            5403 :                         if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
    4299 CBC         364 :                             infomask2 & HEAP_KEYS_UPDATED)
    4300                 :                         {
    4301             349 :                             result = TM_Ok;
    4302 GIC         349 :                             goto out_unlocked;
    4303                 :                         }
    4304            5054 :                         break;
    4305                 :                 }
    4306                 :             }
    4307                 :         }
    4308 ECB             : 
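
    Aside (editor's illustration): the membership loop above relies on tuple
    lock modes forming a strength hierarchy (LockTupleKeyShare <
    LockTupleShare < LockTupleNoKeyExclusive < LockTupleExclusive), so "do we
    already hold something at least as strong?" reduces to an integer
    comparison once each multixact member's status is mapped to its lock
    mode.  Sketched standalone:

        #include <stdbool.h>

        /* mirrors the ordering of PostgreSQL's LockTupleMode enum */
        typedef enum
        {
            MY_KEY_SHARE,
            MY_SHARE,
            MY_NOKEY_EXCLUSIVE,
            MY_EXCLUSIVE
        } MyTupleLockMode;

        /* true if a lock already held is at least as strong as the request */
        static bool
        already_covered(MyTupleLockMode held, MyTupleLockMode wanted)
        {
            return held >= wanted;
        }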
    4309                 :         /*
    4310                 :          * Initially assume that we will have to wait for the locking
    4311                 :          * transaction(s) to finish.  We check various cases below in which
    4312 EUB             :          * this can be turned off.
    4313                 :          */
    4314 GIC        6318 :         require_sleep = true;
    4315 CBC        6318 :         if (mode == LockTupleKeyShare)
    4316                 :         {
    4317                 :             /*
    4318 ECB             :              * If we're requesting KeyShare, and there's no update present, we
    4319                 :              * don't need to wait.  Even if there is an update, we can still
    4320                 :              * continue if the key hasn't been modified.
    4321                 :              *
    4322                 :              * However, if there are updates, we need to walk the update chain
    4323                 :              * to mark future versions of the row as locked, too.  That way,
    4324                 :              * if somebody deletes that future version, we're protected
    4325                 :              * against the key going away.  This locking of future versions
    4326 EUB             :              * could block momentarily, if a concurrent transaction is
    4327                 :              * deleting a key; or it could return a value to the effect that
    4328 ECB             :              * the transaction deleting the key has already committed.  So we
    4329                 :              * do this before re-locking the buffer; otherwise this would be
    4330                 :              * prone to deadlocks.
    4331                 :              *
    4332                 :              * Note that the TID we're locking was grabbed before we unlocked
    4333                 :              * the buffer.  For it to change while we're not looking, the
    4334                 :              * other properties we're testing for below after re-locking the
    4335                 :              * buffer would also change, in which case we would restart this
    4336                 :              * loop above.
    4337                 :              */
    4338 GIC         572 :             if (!(infomask2 & HEAP_KEYS_UPDATED))
    4339                 :             {
    4340                 :                 bool        updated;
    4341                 : 
    4342             541 :                 updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
    4343                 : 
    4344 ECB             :                 /*
    4345                 :                  * If there are updates, follow the update chain; bail out if
    4346                 :                  * that cannot be done.
    4347                 :                  */
    4348 CBC         541 :                 if (follow_updates && updated)
    4349 ECB             :                 {
    4350                 :                     TM_Result   res;
    4351                 : 
    4352 GBC          50 :                     res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
    4353 ECB             :                                                   GetCurrentTransactionId(),
    4354                 :                                                   mode);
    4355 GIC          50 :                     if (res != TM_Ok)
    4356                 :                     {
    4357               6 :                         result = res;
    4358                 :                         /* recovery code expects to have buffer lock held */
    4359               6 :                         LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4360             138 :                         goto failed;
    4361                 :                     }
    4362                 :                 }
    4363                 : 
    4364             535 :                 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4365                 : 
    4366                 :                 /*
    4367                 :                  * Make sure it's still an appropriate lock, else start over.
    4368 ECB             :                  * Also, if it wasn't updated before we released the lock, but
    4369                 :                  * is updated now, we start over too; the reason is that we
    4370                 :                  * now need to follow the update chain to lock the new
    4371                 :                  * versions.
    4372                 :                  */
    4373 CBC         535 :                 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
    4374 GIC          43 :                     ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
    4375              43 :                      !updated))
    4376              14 :                     goto l3;
    4377                 : 
    4378                 :                 /* Things look okay, so we can skip sleeping */
    4379             535 :                 require_sleep = false;
    4380                 : 
    4381                 :                 /*
    4382                 :                  * Note we allow Xmax to change here; other updaters/lockers
    4383                 :                  * could have modified it before we grabbed the buffer lock.
    4384                 :                  * However, this is not a problem, because with the recheck we
    4385 ECB             :                  * just did we ensure that they still don't conflict with the
    4386                 :                  * lock we want.
    4387                 :                  */
    4388                 :             }
    4389                 :         }
    4390 GIC        5746 :         else if (mode == LockTupleShare)
    4391                 :         {
    4392                 :             /*
    4393 ECB             :              * If we're requesting Share, we can similarly avoid sleeping if
    4394                 :              * there's no update and no exclusive lock present.
    4395                 :              */
    4396 CBC         439 :             if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
    4397 GIC         439 :                 !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
    4398                 :             {
    4399 CBC         433 :                 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4400                 : 
    4401 ECB             :                 /*
    4402                 :                  * Make sure it's still an appropriate lock, else start over.
    4403                 :                  * See above about allowing xmax to change.
    4404                 :                  */
    4405 GBC         433 :                 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
    4406 GIC         433 :                     HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
    4407 UIC           0 :                     goto l3;
    4408 CBC         433 :                 require_sleep = false;
    4409                 :             }
    4410 ECB             :         }
    4411 CBC        5307 :         else if (mode == LockTupleNoKeyExclusive)
    4412                 :         {
    4413 ECB             :             /*
    4414                 :              * If we're requesting NoKeyExclusive, we might also be able to
    4415                 :              * avoid sleeping; just ensure that there no conflicting lock
    4416                 :              * already acquired.
    4417                 :              */
    4418 GIC         123 :             if (infomask & HEAP_XMAX_IS_MULTI)
    4419 ECB             :             {
    4420 GIC          26 :                 if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
    4421 ECB             :                                              mode, NULL))
    4422                 :                 {
    4423                 :                     /*
    4424 EUB             :                      * No conflict, but if the xmax changed under us in the
    4425 ECB             :                      * meantime, start over.
    4426                 :                      */
    4427 GIC          13 :                     LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4428              13 :                     if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
    4429 CBC          13 :                         !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
    4430                 :                                              xwait))
    4431 UIC           0 :                         goto l3;
    4432                 : 
    4433                 :                     /* otherwise, we're good */
    4434 GBC          13 :                     require_sleep = false;
    4435                 :                 }
    4436                 :             }
    4437 GIC          97 :             else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
    4438                 :             {
    4439              15 :                 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4440                 : 
    4441                 :                 /* if the xmax changed in the meantime, start over */
    4442              15 :                 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
    4443              15 :                     !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
    4444                 :                                          xwait))
    4445 UIC           0 :                     goto l3;
    4446                 :                 /* otherwise, we're good */
    4447 GIC          15 :                 require_sleep = false;
    4448                 :             }
    4449                 :         }
    4450 ECB             : 
    4451                 :         /*
    4452                 :          * As a check independent of those above, we can also avoid sleeping
    4453                 :          * if the current transaction is the sole locker of the tuple.  Note
    4454                 :          * that the strength of the lock already held is irrelevant; this is
    4455                 :          * not about recording the lock in Xmax (which will be done regardless
    4456                 :          * of this optimization, below).  Also, note that the cases where we
    4457                 :          * hold a lock stronger than we are requesting are already handled
    4458                 :          * above by not doing anything.
    4459                 :          *
    4460                 :          * Note we only deal with the non-multixact case here; MultiXactIdWait
    4461                 :          * is well equipped to deal with this situation on its own.
    4462                 :          */
    4463 GIC       11587 :         if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
    4464 GBC        5275 :             TransactionIdIsCurrentTransactionId(xwait))
    4465 ECB             :         {
    4466                 :             /* ... but if the xmax changed in the meantime, start over */
    4467 CBC        5054 :             LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4468 GIC        5054 :             if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
    4469            5054 :                 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
    4470                 :                                      xwait))
    4471 UBC           0 :                 goto l3;
    4472 GIC        5054 :             Assert(HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask));
    4473            5054 :             require_sleep = false;
    4474                 :         }
    4475                 : 
    4476 ECB             :         /*
    4477                 :          * Time to sleep on the other transaction/multixact, if necessary.
    4478                 :          *
    4479                 :          * If the other transaction is an update/delete that's already
    4480                 :          * committed, then sleeping cannot possibly do any good: if we're
    4481                 :          * required to sleep, get out to raise an error instead.
    4482                 :          *
    4483                 :          * By here, we either have already acquired the buffer exclusive lock,
    4484                 :          * or we must wait for the locking transaction or multixact; so below
    4485                 :          * we ensure that we grab buffer lock after the sleep.
    4486                 :          */
    4487 CBC        6312 :         if (require_sleep && (result == TM_Updated || result == TM_Deleted))
    4488 ECB             :         {
    4489 GIC          94 :             LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4490              94 :             goto failed;
    4491                 :         }
    4492 CBC        6218 :         else if (require_sleep)
    4493                 :         {
    4494                 :             /*
    4495                 :              * Acquire tuple lock to establish our priority for the tuple, or
    4496                 :              * die trying.  LockTuple will release us when we are next-in-line
    4497                 :              * for the tuple.  We must do this even if we are share-locking,
    4498                 :              * but not if we already have a weaker lock on the tuple.
    4499 ECB             :              *
    4500                 :              * If we are forced to "start over" below, we keep the tuple lock;
    4501                 :              * this arranges that we stay at the head of the line while
    4502                 :              * rechecking tuple state.
    4503                 :              */
    4504 CBC         168 :             if (!skip_tuple_lock &&
    4505 GIC         152 :                 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
    4506                 :                                       &have_tuple_lock))
    4507                 :             {
    4508                 :                 /*
    4509                 :                  * This can only happen if wait_policy is Skip and the lock
    4510                 :                  * couldn't be obtained.
    4511                 :                  */
    4512               1 :                 result = TM_WouldBlock;
    4513                 :                 /* recovery code expects to have buffer lock held */
    4514 CBC           1 :                 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4515 GIC           1 :                 goto failed;
    4516                 :             }
    4517                 : 
    4518             166 :             if (infomask & HEAP_XMAX_IS_MULTI)
    4519                 :             {
    4520              40 :                 MultiXactStatus status = get_mxact_status_for_lock(mode, false);
    4521                 : 
    4522                 :                 /* We only ever lock tuples, never update them */
    4523              40 :                 if (status >= MultiXactStatusNoKeyUpdate)
    4524 UIC           0 :                     elog(ERROR, "invalid lock mode in heap_lock_tuple");
    4525 ECB             : 
    4526                 :                 /* wait for multixact to end, or die trying  */
    4527 CBC          40 :                 switch (wait_policy)
    4528 ECB             :                 {
    4529 CBC          36 :                     case LockWaitBlock:
    4530              36 :                         MultiXactIdWait((MultiXactId) xwait, status, infomask,
    4531 ECB             :                                         relation, &tuple->t_self, XLTW_Lock, NULL);
    4532 GIC          36 :                         break;
    4533 CBC           2 :                     case LockWaitSkip:
    4534 GIC           2 :                         if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
    4535                 :                                                         status, infomask, relation,
    4536 ECB             :                                                         NULL))
    4537                 :                         {
    4538 GIC           2 :                             result = TM_WouldBlock;
    4539 ECB             :                             /* recovery code expects to have buffer lock held */
    4540 GIC           2 :                             LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4541               2 :                             goto failed;
    4542                 :                         }
    4543 UIC           0 :                         break;
    4544 GIC           2 :                     case LockWaitError:
    4545               2 :                         if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
    4546                 :                                                         status, infomask, relation,
    4547                 :                                                         NULL))
    4548               2 :                             ereport(ERROR,
    4549 ECB             :                                     (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
    4550                 :                                      errmsg("could not obtain lock on row in relation \"%s\"",
    4551                 :                                             RelationGetRelationName(relation))));
    4552                 : 
    4553 LBC           0 :                         break;
    4554 ECB             :                 }
    4555                 : 
    4556                 :                 /*
    4557                 :                  * Of course, the multixact might not be done here: if we're
    4558                 :                  * requesting a light lock mode, other transactions with light
    4559                 :                  * locks could still be alive, as well as locks owned by our
    4560                 :                  * own xact or other subxacts of this backend.  We need to
    4561                 :                  * preserve the surviving MultiXact members.  Note that it
    4562                 :                  * isn't absolutely necessary in the latter case, but doing so
    4563                 :                  * is simpler.
    4564                 :                  */
    4565                 :             }
    4566                 :             else
    4567                 :             {
    4568                 :                 /* wait for regular transaction to end, or die trying */
    4569 GIC         126 :                 switch (wait_policy)
    4570                 :                 {
    4571 CBC          87 :                     case LockWaitBlock:
    4572 GIC          87 :                         XactLockTableWait(xwait, relation, &tuple->t_self,
    4573 EUB             :                                           XLTW_Lock);
    4574 GBC          87 :                         break;
    4575              33 :                     case LockWaitSkip:
    4576              33 :                         if (!ConditionalXactLockTableWait(xwait))
    4577                 :                         {
    4578 GIC          33 :                             result = TM_WouldBlock;
    4579 ECB             :                             /* recovery code expects to have buffer lock held */
    4580 CBC          33 :                             LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4581 GIC          33 :                             goto failed;
    4582                 :                         }
    4583 UIC           0 :                         break;
    4584 GIC           6 :                     case LockWaitError:
    4585               6 :                         if (!ConditionalXactLockTableWait(xwait))
    4586               6 :                             ereport(ERROR,
    4587                 :                                     (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
    4588                 :                                      errmsg("could not obtain lock on row in relation \"%s\"",
    4589                 :                                             RelationGetRelationName(relation))));
    4590 LBC           0 :                         break;
    4591                 :                 }
    4592                 :             }
    4593                 : 
    4594                 :             /* if there are updates, follow the update chain */
    4595 GIC         123 :             if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
    4596                 :             {
    4597 ECB             :                 TM_Result   res;
    4598                 : 
    4599 GIC          38 :                 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
    4600                 :                                               GetCurrentTransactionId(),
    4601 ECB             :                                               mode);
    4602 GIC          38 :                 if (res != TM_Ok)
    4603                 :                 {
    4604               2 :                     result = res;
    4605                 :                     /* recovery code expects to have buffer lock held */
    4606               2 :                     LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4607               2 :                     goto failed;
    4608                 :                 }
    4609                 :             }
    4610                 : 
    4611             121 :             LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4612                 : 
    4613 ECB             :             /*
    4614                 :              * xwait is done, but if xwait had just locked the tuple then some
    4615                 :              * other xact could update this tuple before we get to this point.
    4616                 :              * Check for xmax change, and start over if so.
    4617                 :              */
    4618 CBC         121 :             if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
    4619             109 :                 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
    4620                 :                                      xwait))
    4621 GIC          14 :                 goto l3;
    4622                 : 
    4623             107 :             if (!(infomask & HEAP_XMAX_IS_MULTI))
    4624                 :             {
    4625                 :                 /*
    4626                 :                  * Otherwise check if it committed or aborted.  Note we cannot
    4627                 :                  * be here if the tuple was only locked by somebody who didn't
    4628 ECB             :                  * conflict with us; that would have been handled above.  So
    4629                 :                  * that transaction must necessarily be gone by now.  But
    4630                 :                  * don't check for this in the multixact case, because some
    4631                 :                  * locker transactions might still be running.
    4632                 :                  */
    4633 CBC          76 :                 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
    4634                 :             }
    4635 ECB             :         }
    4636                 : 
    4637                 :         /* By here, we're certain that we hold buffer exclusive lock again */
    4638                 : 
    4639                 :         /*
    4640                 :          * We may lock if previous xmax aborted, or if it committed but only
    4641                 :          * locked the tuple without updating it; or if we didn't have to wait
    4642                 :          * at all for whatever reason.
    4643                 :          */
    4644 GIC        6157 :         if (!require_sleep ||
    4645             107 :             (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
    4646             139 :             HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
    4647              62 :             HeapTupleHeaderIsOnlyLocked(tuple->t_data))
    4648            6101 :             result = TM_Ok;
    4649              56 :         else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
    4650              45 :             result = TM_Updated;
    4651                 :         else
    4652 CBC          11 :             result = TM_Deleted;
    4653                 :     }
    4654                 : 
    4655 GIC       75526 : failed:
    4656           81821 :     if (result != TM_Ok)
    4657 ECB             :     {
    4658 CBC         200 :         Assert(result == TM_SelfModified || result == TM_Updated ||
    4659                 :                result == TM_Deleted || result == TM_WouldBlock);
    4660 ECB             : 
    4661                 :         /*
    4662                 :          * When locking a tuple under LockWaitSkip semantics and we fail with
    4663                 :          * TM_WouldBlock above, it's possible for concurrent transactions to
    4664                 :          * release the lock and set HEAP_XMAX_INVALID in the meantime.  So
    4665                 :          * this assert is slightly different from the equivalent one in
    4666                 :          * heap_delete and heap_update.
    4667                 :          */
    4668 GIC         200 :         Assert((result == TM_WouldBlock) ||
    4669 ECB             :                !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
    4670 GIC         200 :         Assert(result != TM_Updated ||
    4671 ECB             :                !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
    4672 GIC         200 :         tmfd->ctid = tuple->t_data->t_ctid;
    4673             200 :         tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
    4674 CBC         200 :         if (result == TM_SelfModified)
    4675 GIC           6 :             tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
    4676 ECB             :         else
    4677 GIC         194 :             tmfd->cmax = InvalidCommandId;
    4678 CBC         200 :         goto out_locked;
    4679 ECB             :     }
    4680                 : 
    4681                 :     /*
    4682                 :      * If we didn't pin the visibility map page and the page has become all
    4683                 :      * visible while we were busy locking the buffer, or during some
    4684                 :      * subsequent window during which we had it unlocked, we'll have to unlock
    4685                 :      * and re-lock, to avoid holding the buffer lock across I/O.  That's a bit
    4686                 :      * unfortunate, especially since we'll now have to recheck whether the
    4687                 :      * tuple has been locked or updated under us, but hopefully it won't
    4688                 :      * happen very often.
    4689                 :      */
    4690 GIC       81621 :     if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
    4691                 :     {
    4692 UIC           0 :         LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
    4693               0 :         visibilitymap_pin(relation, block, &vmbuffer);
    4694 LBC           0 :         LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
    4695               0 :         goto l3;
    4696                 :     }
    4697 ECB             : 
    4698 GIC       81621 :     xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
    4699           81621 :     old_infomask = tuple->t_data->t_infomask;
    4700                 : 
    4701                 :     /*
    4702                 :      * If this is the first possibly-multixact-able operation in the current
    4703                 :      * transaction, set my per-backend OldestMemberMXactId setting. We can be
    4704                 :      * certain that the transaction will never become a member of any older
    4705                 :      * MultiXactIds than that.  (We have to do this even if we end up just
    4706                 :      * using our own TransactionId below, since some other backend could
    4707                 :      * incorporate our XID into a MultiXact immediately afterwards.)
    4708                 :      */
    4709           81621 :     MultiXactIdSetOldestMember();
    4710                 : 
    4711                 :     /*
    4712                 :      * Compute the new xmax and infomask to store into the tuple.  Note we do
    4713 ECB             :      * not modify the tuple just yet, because that would leave it in the wrong
    4714                 :      * state if multixact.c elogs.
    4715                 :      */
    4716 CBC       81621 :     compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
    4717 ECB             :                               GetCurrentTransactionId(), mode, false,
    4718                 :                               &xid, &new_infomask, &new_infomask2);
    4719                 : 
    4720 GIC       81621 :     START_CRIT_SECTION();
    4721 ECB             : 
    4722                 :     /*
    4723                 :      * Store transaction information of xact locking the tuple.
    4724                 :      *
    4725                 :      * Note: Cmax is meaningless in this context, so don't set it; this avoids
    4726                 :      * possibly generating a useless combo CID.  Moreover, if we're locking a
    4727                 :      * previously updated tuple, it's important to preserve the Cmax.
    4728                 :      *
    4729                 :      * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
    4730                 :      * we would break the HOT chain.
    4731                 :      */
    4732 CBC       81621 :     tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
    4733 GIC       81621 :     tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    4734           81621 :     tuple->t_data->t_infomask |= new_infomask;
    4735           81621 :     tuple->t_data->t_infomask2 |= new_infomask2;
    4736 CBC       81621 :     if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
    4737 GIC       81582 :         HeapTupleHeaderClearHotUpdated(tuple->t_data);
    4738 CBC       81621 :     HeapTupleHeaderSetXmax(tuple->t_data, xid);
    4739                 : 
    4740 ECB             :     /*
    4741                 :      * Make sure there is no forward chain link in t_ctid.  Note that in the
    4742                 :      * cases where the tuple has been updated, we must not overwrite t_ctid,
    4743                 :      * because it was set by the updater.  Moreover, if the tuple has been
    4744                 :      * updated, we need to follow the update chain to lock the new versions of
    4745                 :      * the tuple as well.
    4746                 :      */
    4747 GIC       81621 :     if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
    4748           81582 :         tuple->t_data->t_ctid = *tid;
    4749                 : 
    4750                 :     /* Clear only the all-frozen bit on visibility map if needed */
    4751           83272 :     if (PageIsAllVisible(page) &&
    4752            1651 :         visibilitymap_clear(relation, block, vmbuffer,
    4753                 :                             VISIBILITYMAP_ALL_FROZEN))
    4754              15 :         cleared_all_frozen = true;
    4755                 : 
    4756                 : 
    4757           81621 :     MarkBufferDirty(*buffer);
    4758                 : 
    4759                 :     /*
    4760                 :      * XLOG stuff.  You might think that we don't need an XLOG record because
    4761                 :      * there is no state change worth restoring after a crash.  You would be
    4762 ECB             :      * wrong however: we have just written either a TransactionId or a
    4763                 :      * MultiXactId that may never have been seen on disk before, and we need
    4764                 :      * to make sure that there are XLOG entries covering those ID numbers.
    4765                 :      * Else the same IDs might be re-used after a crash, which would be
    4766                 :      * disastrous if this page made it to disk before the crash.  Essentially
    4767                 :      * we have to enforce the WAL log-before-data rule even in this case.
    4768                 :      * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
    4769                 :      * entries for everything anyway.)
    4770                 :      */
    4771 GIC       81621 :     if (RelationNeedsWAL(relation))
    4772 ECB             :     {
    4773                 :         xl_heap_lock xlrec;
    4774                 :         XLogRecPtr  recptr;
    4775                 : 
    4776 CBC       81066 :         XLogBeginInsert();
    4777           81066 :         XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
    4778                 : 
    4779 GIC       81066 :         xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
    4780           81066 :         xlrec.locking_xid = xid;
    4781          162132 :         xlrec.infobits_set = compute_infobits(new_infomask,
    4782           81066 :                                               tuple->t_data->t_infomask2);
    4783           81066 :         xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
    4784           81066 :         XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
    4785                 : 
    4786                 :         /* we don't decode row locks at the moment, so no need to log the origin */
    4787 ECB             : 
    4788 GIC       81066 :         recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
    4789 ECB             : 
    4790 CBC       81066 :         PageSetLSN(page, recptr);
    4791 ECB             :     }
    4792                 : 
    4793 GIC       81621 :     END_CRIT_SECTION();
    4794                 : 
    4795 CBC       81621 :     result = TM_Ok;
    4796 ECB             : 
    4797 GIC       81833 : out_locked:
    4798 CBC       81833 :     LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
    4799 ECB             : 
    4800 CBC       82473 : out_unlocked:
    4801           82473 :     if (BufferIsValid(vmbuffer))
    4802            1651 :         ReleaseBuffer(vmbuffer);
    4803 ECB             : 
    4804                 :     /*
    4805                 :      * Don't update the visibility map here. Locking a tuple doesn't change
    4806                 :      * visibility info.
    4807                 :      */
    4808                 : 
    4809                 :     /*
    4810                 :      * Now that we have successfully marked the tuple as locked, we can
    4811                 :      * release the lmgr tuple lock, if we had it.
    4812                 :      */
    4813 CBC       82473 :     if (have_tuple_lock)
    4814             137 :         UnlockTupleTuplock(relation, tid, mode);
    4815 EUB             : 
    4816 GBC       82473 :     return result;
    4817 EUB             : }
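
/*
 * Editor's sketch (not part of heapam.c): one way a caller might drive the
 * function above under LockWaitSkip semantics, roughly what the executor
 * does for SELECT ... FOR UPDATE SKIP LOCKED.  The helper name, the chosen
 * lock mode, and the command id are illustrative assumptions; snapshot
 * checks and error handling are elided.
 */
static bool
sketch_lock_tuple_skip_locked(Relation rel, HeapTuple tuple)
{
    Buffer          buffer;
    TM_FailureData  tmfd;
    TM_Result       res;

    res = heap_lock_tuple(rel, tuple,
                          GetCurrentCommandId(true),
                          LockTupleExclusive,
                          LockWaitSkip,
                          true,         /* follow_updates */
                          &buffer, &tmfd);

    /* heap_lock_tuple returns with the tuple's buffer pinned */
    ReleaseBuffer(buffer);

    /* TM_WouldBlock is the "skip this row" outcome taken at failed: above */
    return (res == TM_Ok);
}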
    4818                 : 
    4819                 : /*
    4820                 :  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
    4821 ECB             :  * its normal, Xmax-based tuple lock.
    4822                 :  *
    4823                 :  * have_tuple_lock is an input and output parameter: on input, it indicates
    4824                 :  * whether the lock has previously been acquired (and this function does
    4825                 :  * nothing in that case).  If this function returns success, have_tuple_lock
    4826                 :  * has been flipped to true.
    4827                 :  *
    4828                 :  * Returns false if it was unable to obtain the lock; this can only happen if
    4829                 :  * wait_policy is Skip.
    4830                 :  */
    4831                 : static bool
    4832 GIC         246 : heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
    4833                 :                      LockWaitPolicy wait_policy, bool *have_tuple_lock)
    4834                 : {
    4835             246 :     if (*have_tuple_lock)
    4836               9 :         return true;
    4837                 : 
    4838 CBC         237 :     switch (wait_policy)
    4839                 :     {
    4840 GBC         196 :         case LockWaitBlock:
    4841             196 :             LockTupleTuplock(relation, tid, mode);
    4842             196 :             break;
    4843                 : 
    4844 GIC          34 :         case LockWaitSkip:
    4845              34 :             if (!ConditionalLockTupleTuplock(relation, tid, mode))
    4846               1 :                 return false;
    4847              33 :             break;
    4848                 : 
    4849               7 :         case LockWaitError:
    4850               7 :             if (!ConditionalLockTupleTuplock(relation, tid, mode))
    4851               1 :                 ereport(ERROR,
    4852                 :                         (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
    4853                 :                          errmsg("could not obtain lock on row in relation \"%s\"",
    4854                 :                                 RelationGetRelationName(relation))));
    4855               6 :             break;
    4856                 :     }
    4857 CBC         235 :     *have_tuple_lock = true;
    4858                 : 
    4859             235 :     return true;
    4860 ECB             : }
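
/*
 * Editor's sketch (not part of heapam.c): the have_tuple_lock in/out
 * contract described above, as heap_lock_tuple uses it.  The flag starts
 * false; the first successful call flips it to true and later calls return
 * immediately, so the heavyweight lock is taken at most once and released
 * at most once.  The wrapper name is a hypothetical illustration.
 */
static TM_Result
sketch_acquire_and_release(Relation relation, ItemPointer tid,
                           LockTupleMode mode)
{
    bool        have_tuple_lock = false;

    if (!heap_acquire_tuplock(relation, tid, mode, LockWaitSkip,
                              &have_tuple_lock))
        return TM_WouldBlock;   /* only possible under LockWaitSkip */

    /* ... recheck tuple state; repeated calls become no-ops ... */

    if (have_tuple_lock)
        UnlockTupleTuplock(relation, tid, mode);
    return TM_Ok;
}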
    4861                 : 
    4862                 : /*
    4863                 :  * Given an original set of Xmax and infomask, and a transaction (identified by
    4864                 :  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
    4865                 :  * corresponding infomasks to use on the tuple.
    4866                 :  *
    4867                 :  * Note that this might have side effects such as creating a new MultiXactId.
    4868                 :  *
    4869                 :  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
    4870                 :  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
    4871                 :  * but it was not running anymore. There is a race condition, which is that the
    4872                 :  * MultiXactId may have finished since then, but that uncommon case is handled
    4873                 :  * either here, or within MultiXactIdExpand.
    4874                 :  *
    4875                 :  * There is a similar race condition possible when the old xmax was a regular
    4876                 :  * TransactionId.  We test TransactionIdIsInProgress again just to narrow the
    4877                 :  * window, but it's still possible to end up creating an unnecessary
    4878                 :  * MultiXactId.  Fortunately this is harmless.
    4879                 :  */
    4880                 : static void
    4881 GIC     2117116 : compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
    4882                 :                           uint16 old_infomask2, TransactionId add_to_xmax,
    4883                 :                           LockTupleMode mode, bool is_update,
    4884                 :                           TransactionId *result_xmax, uint16 *result_infomask,
    4885                 :                           uint16 *result_infomask2)
    4886                 : {
    4887                 :     TransactionId new_xmax;
    4888 ECB             :     uint16      new_infomask,
    4889 EUB             :                 new_infomask2;
    4890                 : 
    4891 CBC     2117116 :     Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
    4892                 : 
    4893         2220971 : l5:
    4894 GIC     2220971 :     new_infomask = 0;
    4895         2220971 :     new_infomask2 = 0;
    4896         2220971 :     if (old_infomask & HEAP_XMAX_INVALID)
    4897                 :     {
    4898                 :         /*
    4899                 :          * No previous locker; we just insert our own TransactionId.
    4900 ECB             :          *
    4901                 :          * Note that it's critical that this case be the first one checked,
    4902                 :          * because there are several blocks below that come back to this one
    4903                 :          * to implement certain optimizations; old_infomask might contain
    4904                 :          * other dirty bits in those cases, but we don't really care.
    4905                 :          */
    4906 GIC     2116000 :         if (is_update)
    4907                 :         {
    4908         1838882 :             new_xmax = add_to_xmax;
    4909         1838882 :             if (mode == LockTupleExclusive)
    4910         1455536 :                 new_infomask2 |= HEAP_KEYS_UPDATED;
    4911                 :         }
    4912                 :         else
    4913                 :         {
    4914 CBC      277118 :             new_infomask |= HEAP_XMAX_LOCK_ONLY;
    4915 GIC      277118 :             switch (mode)
    4916 ECB             :             {
    4917 CBC        2202 :                 case LockTupleKeyShare:
    4918            2202 :                     new_xmax = add_to_xmax;
    4919            2202 :                     new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
    4920            2202 :                     break;
    4921 GIC         706 :                 case LockTupleShare:
    4922 CBC         706 :                     new_xmax = add_to_xmax;
    4923             706 :                     new_infomask |= HEAP_XMAX_SHR_LOCK;
    4924 GIC         706 :                     break;
    4925 CBC      179670 :                 case LockTupleNoKeyExclusive:
    4926 GIC      179670 :                     new_xmax = add_to_xmax;
    4927          179670 :                     new_infomask |= HEAP_XMAX_EXCL_LOCK;
    4928          179670 :                     break;
    4929           94540 :                 case LockTupleExclusive:
    4930           94540 :                     new_xmax = add_to_xmax;
    4931           94540 :                     new_infomask |= HEAP_XMAX_EXCL_LOCK;
    4932           94540 :                     new_infomask2 |= HEAP_KEYS_UPDATED;
    4933           94540 :                     break;
    4934 UIC           0 :                 default:
    4935 UBC           0 :                     new_xmax = InvalidTransactionId;    /* silence compiler */
    4936               0 :                     elog(ERROR, "invalid lock mode");
    4937 EUB             :             }
    4938                 :         }
    4939                 :     }
    4940 GIC      104971 :     else if (old_infomask & HEAP_XMAX_IS_MULTI)
    4941                 :     {
    4942                 :         MultiXactStatus new_status;
    4943                 : 
    4944 ECB             :         /*
    4945 EUB             :          * Currently we don't allow XMAX_COMMITTED to be set for multis, so
    4946                 :          * cross-check.
    4947 ECB             :          */
    4948 GIC         120 :         Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
    4949                 : 
    4950 ECB             :         /*
    4951                 :          * A multixact together with LOCK_ONLY set but neither lock bit set
    4952                 :          * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
    4953                 :          * anymore.  This check is critical for databases upgraded by
    4954                 :          * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
    4955                 :          * that such multis are never passed.
    4956                 :          */
    4957 CBC         120 :         if (HEAP_LOCKED_UPGRADED(old_infomask))
    4958                 :         {
    4959 UIC           0 :             old_infomask &= ~HEAP_XMAX_IS_MULTI;
    4960               0 :             old_infomask |= HEAP_XMAX_INVALID;
    4961               0 :             goto l5;
    4962                 :         }
    4963                 : 
    4964                 :         /*
    4965                 :          * If the XMAX is already a MultiXactId, then we need to expand it to
    4966                 :          * include add_to_xmax; but if all the members were lockers and are
    4967                 :          * all gone, we can do away with the IS_MULTI bit and just set
    4968 ECB             :          * add_to_xmax as the only locker/updater.  If all lockers are gone
    4969                 :          * and we have an updater that aborted, we can also do without a
    4970                 :          * multi.
    4971                 :          *
    4972                 :          * The cost of doing GetMultiXactIdMembers would be paid by
    4973                 :          * MultiXactIdExpand if we weren't to do this, so this check is not
    4974                 :          * incurring extra work anyhow.
    4975                 :          */
    4976 CBC         120 :         if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
    4977                 :         {
    4978 GIC          23 :             if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
    4979               8 :                 !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
    4980 ECB             :                                                                 old_infomask)))
    4981                 :             {
    4982                 :                 /*
    4983                 :                  * Reset these bits and restart; otherwise fall through to
    4984                 :                  * create a new multi below.
    4985                 :                  */
    4986 CBC          23 :                 old_infomask &= ~HEAP_XMAX_IS_MULTI;
    4987              23 :                 old_infomask |= HEAP_XMAX_INVALID;
    4988 GIC          23 :                 goto l5;
    4989                 :             }
    4990                 :         }
    4991                 : 
    4992              97 :         new_status = get_mxact_status_for_lock(mode, is_update);
    4993                 : 
    4994              97 :         new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
    4995 ECB             :                                      new_status);
    4996 GBC          97 :         GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
    4997                 :     }
    4998 CBC      104851 :     else if (old_infomask & HEAP_XMAX_COMMITTED)
    4999                 :     {
    5000 ECB             :         /*
    5001                 :          * It's a committed update, so we must preserve it as the updater of
    5002                 :          * the tuple.
    5003                 :          */
    5004                 :         MultiXactStatus status;
    5005                 :         MultiXactStatus new_status;
    5006                 : 
    5007 CBC          13 :         if (old_infomask2 & HEAP_KEYS_UPDATED)
    5008 LBC           0 :             status = MultiXactStatusUpdate;
    5009                 :         else
    5010 GIC          13 :             status = MultiXactStatusNoKeyUpdate;
    5011                 : 
    5012              13 :         new_status = get_mxact_status_for_lock(mode, is_update);
    5013                 : 
    5014                 :         /*
    5015                 :          * Since it's not running, it's obviously impossible for the old
    5016                 :          * updater to be identical to the current one, so we need not check
    5017                 :          * for that case as we do in the block above.
    5018 ECB             :          */
    5019 CBC          13 :         new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
    5020 GIC          13 :         GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
    5021                 :     }
    5022 CBC      104838 :     else if (TransactionIdIsInProgress(xmax))
    5023 ECB             :     {
    5024                 :         /*
    5025                 :          * If the XMAX is a valid, in-progress TransactionId, then we need to
    5026                 :          * create a new MultiXactId that includes both the old locker or
    5027                 :          * updater and our own TransactionId.
    5028                 :          */
    5029                 :         MultiXactStatus new_status;
    5030                 :         MultiXactStatus old_status;
    5031                 :         LockTupleMode old_mode;
    5032                 : 
    5033 GIC      104829 :         if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
    5034                 :         {
    5035          104803 :             if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
    5036            5603 :                 old_status = MultiXactStatusForKeyShare;
    5037           99200 :             else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
    5038             429 :                 old_status = MultiXactStatusForShare;
    5039           98771 :             else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
    5040                 :             {
    5041           98771 :                 if (old_infomask2 & HEAP_KEYS_UPDATED)
    5042           92666 :                     old_status = MultiXactStatusForUpdate;
    5043 ECB             :                 else
    5044 GIC        6105 :                     old_status = MultiXactStatusForNoKeyUpdate;
    5045                 :             }
    5046                 :             else
    5047                 :             {
    5048                 :                 /*
    5049 ECB             :                  * LOCK_ONLY can be present alone only when a page has been
    5050                 :                  * upgraded by pg_upgrade.  But in that case,
    5051                 :                  * TransactionIdIsInProgress() should have returned false.  We
    5052                 :                  * assume it's no longer locked in this case.
    5053                 :                  */
    5054 UIC           0 :                 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
    5055               0 :                 old_infomask |= HEAP_XMAX_INVALID;
    5056               0 :                 old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
    5057 LBC           0 :                 goto l5;
    5058                 :             }
    5059                 :         }
    5060                 :         else
    5061                 :         {
    5062                 :             /* it's an update, but which kind? */
    5063 GIC          26 :             if (old_infomask2 & HEAP_KEYS_UPDATED)
    5064 UBC           0 :                 old_status = MultiXactStatusUpdate;
    5065                 :             else
    5066 CBC          26 :                 old_status = MultiXactStatusNoKeyUpdate;
    5067                 :         }
    5068                 : 
    5069 GIC      104829 :         old_mode = TUPLOCK_from_mxstatus(old_status);
    5070                 : 
    5071                 :         /*
    5072                 :          * If the lock to be acquired is for the same TransactionId as the
    5073                 :          * existing lock, there's an optimization possible: consider only the
    5074 ECB             :          * strongest of both locks as the only one present, and restart.
    5075                 :          */
    5076 GIC      104829 :         if (xmax == add_to_xmax)
    5077 ECB             :         {
    5078                 :             /*
    5079                 :              * Note that it's not possible for the original tuple to be
    5080                 :              * updated: we wouldn't be here because the tuple would have been
    5081                 :              * invisible and we wouldn't try to update it.  As a subtlety,
    5082                 :              * this code can also run when traversing an update chain to lock
    5083                 :              * future versions of a tuple.  But we wouldn't be here either,
    5084                 :              * because the add_to_xmax would be different from the original
    5085                 :              * updater.
    5086                 :              */
    5087 CBC      103824 :             Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
    5088 ECB             : 
    5089                 :             /* acquire the strongest of both */
    5090 GIC      103824 :             if (mode < old_mode)
    5091           52130 :                 mode = old_mode;
    5092                 :             /* mustn't touch is_update */
    5093                 : 
    5094          103824 :             old_infomask |= HEAP_XMAX_INVALID;
    5095          103824 :             goto l5;
    5096                 :         }
    5097                 : 
    5098                 :         /* otherwise, just fall back to creating a new multixact */
    5099            1005 :         new_status = get_mxact_status_for_lock(mode, is_update);
    5100            1005 :         new_xmax = MultiXactIdCreate(xmax, old_status,
    5101                 :                                      add_to_xmax, new_status);
    5102            1005 :         GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
    5103                 :     }
    5104              14 :     else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
    5105 CBC           5 :              TransactionIdDidCommit(xmax))
    5106               1 :     {
    5107                 :         /*
    5108                 :          * It's a committed update, so we must preserve it as the updater of
    5109                 :          * the tuple.
    5110                 :          */
    5111                 :         MultiXactStatus status;
    5112                 :         MultiXactStatus new_status;
    5113                 : 
    5114 GIC           1 :         if (old_infomask2 & HEAP_KEYS_UPDATED)
    5115 LBC           0 :             status = MultiXactStatusUpdate;
    5116                 :         else
    5117 GIC           1 :             status = MultiXactStatusNoKeyUpdate;
    5118 ECB             : 
    5119 GIC           1 :         new_status = get_mxact_status_for_lock(mode, is_update);
    5120                 : 
    5121                 :         /*
    5122 EUB             :          * Since it's not running, it's obviously impossible for the old
    5123                 :          * updater to be identical to the current one, so we need not check
    5124                 :          * for that case as we do in the block above.
    5125                 :          */
    5126 GIC           1 :         new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
    5127               1 :         GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
    5128                 :     }
    5129                 :     else
    5130                 :     {
    5131                 :         /*
    5132                 :          * Can get here iff the locking/updating transaction was running when
    5133                 :          * the infomask was extracted from the tuple, but finished before
    5134 ECB             :          * TransactionIdIsInProgress got to run.  Deal with it as if there were
    5135                 :          * no locker at all in the first place.
    5136                 :          */
    5137 GIC           8 :         old_infomask |= HEAP_XMAX_INVALID;
    5138               8 :         goto l5;
    5139                 :     }
    5140                 : 
    5141         2117116 :     *result_infomask = new_infomask;
    5142         2117116 :     *result_infomask2 = new_infomask2;
    5143         2117116 :     *result_xmax = new_xmax;
    5144         2117116 : }
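
/*
 * Editor's sketch (not part of heapam.c): decoding a pure locker's strength
 * back out of the infomasks written above.  The helper name is hypothetical;
 * the HEAP_XMAX_* macros are the real ones from access/htup_details.h, where
 * the share-lock bit pattern is the union of the key-share and exclusive
 * bits, so two infomask bits encode three lock strengths.
 */
static LockTupleMode
sketch_lockmode_from_infomask(uint16 infomask, uint16 infomask2)
{
    Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask));

    if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
        return LockTupleKeyShare;
    if (HEAP_XMAX_IS_SHR_LOCKED(infomask))
        return LockTupleShare;

    /*
     * Both exclusive strengths set HEAP_XMAX_EXCL_LOCK; they differ only in
     * infomask2's HEAP_KEYS_UPDATED bit, mirroring the switch above.
     */
    return (infomask2 & HEAP_KEYS_UPDATED) ?
        LockTupleExclusive : LockTupleNoKeyExclusive;
}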
    5145                 : 
    5146                 : /*
    5147 ECB             :  * Subroutine for heap_lock_updated_tuple_rec.
    5148                 :  *
    5149                 :  * Given a hypothetical multixact status held by the transaction identified
    5150                 :  * with the given xid, does the current transaction need to wait, fail, or can
    5151                 :  * it continue if it wanted to acquire a lock of the given mode?  "needwait"
    5152                 :  * is set to true if waiting is necessary; if it can continue, then TM_Ok is
    5153                 :  * returned.  If the lock is already held by the current transaction, return
    5154                 :  * TM_SelfModified.  In case of a conflict with another transaction, a
    5155                 :  * different HeapTupleSatisfiesUpdate return code is returned.
    5156                 :  *
    5157                 :  * The held status is said to be hypothetical because it might correspond to a
    5158                 :  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
    5159                 :  * way for simplicity of API.
    5160                 :  */
    5161                 : static TM_Result
    5162 CBC          32 : test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
    5163                 :                            LockTupleMode mode, HeapTuple tup,
    5164                 :                            bool *needwait)
    5165                 : {
    5166                 :     MultiXactStatus wantedstatus;
    5167                 : 
    5168 GIC          32 :     *needwait = false;
    5169              32 :     wantedstatus = get_mxact_status_for_lock(mode, false);
    5170                 : 
    5171 EUB             :     /*
    5172                 :      * Note: we *must* check TransactionIdIsInProgress before
    5173                 :      * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
    5174                 :      * for an explanation.
    5175 ECB             :      */
    5176 CBC          32 :     if (TransactionIdIsCurrentTransactionId(xid))
    5177                 :     {
    5178                 :         /*
    5179                 :          * The tuple has already been locked by our own transaction.  This is
    5180                 :          * very rare but can happen if multiple transactions are trying to
    5181                 :          * lock an ancient version of the same tuple.
    5182                 :          */
    5183 UIC           0 :         return TM_SelfModified;
    5184 ECB             :     }
    5185 GIC          32 :     else if (TransactionIdIsInProgress(xid))
    5186 EUB             :     {
    5187                 :         /*
    5188                 :          * If the locking transaction is running, what we do depends on
    5189                 :          * whether the lock modes conflict: if they do, then we must wait for
    5190 ECB             :          * it to finish; otherwise we can fall through to lock this tuple
    5191                 :          * version without waiting.
    5192                 :          */
    5193 GIC          16 :         if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
    5194              16 :                                 LOCKMODE_from_mxstatus(wantedstatus)))
    5195                 :         {
    5196               8 :             *needwait = true;
    5197                 :         }
    5198                 : 
    5199                 :         /*
    5200                 :          * If we set needwait above, then this value doesn't matter;
    5201                 :          * otherwise, this value signals to caller that it's okay to proceed.
    5202                 :          */
    5203              16 :         return TM_Ok;
    5204                 :     }
    5205 CBC          16 :     else if (TransactionIdDidAbort(xid))
    5206 GIC           3 :         return TM_Ok;
    5207 GBC          13 :     else if (TransactionIdDidCommit(xid))
    5208 EUB             :     {
    5209                 :         /*
    5210                 :          * The other transaction committed.  If it was only a locker, then the
    5211                 :          * lock is completely gone now and we can return success; but if it
    5212                 :          * was an update, then what we do depends on whether the two lock
    5213                 :          * modes conflict.  If they conflict, then we must report error to
    5214                 :          * caller. But if they don't, we can fall through to allow the current
    5215                 :          * transaction to lock the tuple.
    5216 ECB             :          *
    5217                 :          * Note: the reason we worry about ISUPDATE here is that as soon as
    5218                 :          * a transaction ends, all its locks are gone and meaningless, and
    5219                 :          * thus we can ignore them; whereas its updates persist.  In the
    5220 EUB             :          * TransactionIdIsInProgress case, above, we don't need to check
    5221                 :          * because we know the lock is still "alive" and thus a conflict
    5222                 :          * must always be checked.
    5223                 :          */
    5224 GIC          13 :         if (!ISUPDATE_from_mxstatus(status))
    5225               4 :             return TM_Ok;
    5226                 : 
    5227               9 :         if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
    5228               9 :                                 LOCKMODE_from_mxstatus(wantedstatus)))
    5229 ECB             :         {
    5230                 :             /* bummer */
    5231 CBC           8 :             if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
    5232               6 :                 return TM_Updated;
    5233                 :             else
    5234 GIC           2 :                 return TM_Deleted;
    5235 ECB             :         }
    5236                 : 
    5237 CBC           1 :         return TM_Ok;
    5238                 :     }
    5239                 : 
    5240                 :     /* Not in progress, not aborted, not committed -- must have crashed */
    5241 UIC           0 :     return TM_Ok;
    5242                 : }
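
/*
 * Editor's sketch (not part of heapam.c): the effective row-lock conflict
 * matrix that the DoLockModesConflict calls above compute through the
 * heavyweight-lock proxies of the four tuple lock strengths (KeyShare ->
 * AccessShareLock, Share -> RowShareLock, NoKeyExclusive -> ExclusiveLock,
 * Exclusive -> AccessExclusiveLock).  The helper is hypothetical and simply
 * tabulates the row-level lock matrix from the documentation.
 */
static bool
sketch_tuple_lock_conflicts(LockTupleMode held, LockTupleMode wanted)
{
    static const bool conflicts[4][4] = {
        /* held \ wanted: KeyShare Share  NoKeyExcl Excl */
        /* KeyShare  */ {false,   false, false,    true},
        /* Share     */ {false,   false, true,     true},
        /* NoKeyExcl */ {false,   true,  true,     true},
        /* Exclusive */ {true,    true,  true,     true},
    };

    return conflicts[held][wanted];
}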
    5243                 : 
    5244                 : 
    5245 ECB             : /*
    5246                 :  * Recursive part of heap_lock_updated_tuple
    5247                 :  *
    5248                 :  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
    5249                 :  * xid with the given mode; if this tuple is updated, recurse to lock the new
    5250                 :  * version as well.
    5251                 :  */
    5252                 : static TM_Result
    5253 GIC          80 : heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
    5254                 :                             LockTupleMode mode)
    5255                 : {
    5256                 :     TM_Result   result;
    5257                 :     ItemPointerData tupid;
    5258                 :     HeapTupleData mytup;
    5259                 :     Buffer      buf;
    5260                 :     uint16      new_infomask,
    5261                 :                 new_infomask2,
    5262                 :                 old_infomask,
    5263                 :                 old_infomask2;
    5264                 :     TransactionId xmax,
    5265                 :                 new_xmax;
    5266 CBC          80 :     TransactionId priorXmax = InvalidTransactionId;
    5267 GIC          80 :     bool        cleared_all_frozen = false;
    5268 ECB             :     bool        pinned_desired_page;
    5269 CBC          80 :     Buffer      vmbuffer = InvalidBuffer;
    5270 ECB             :     BlockNumber block;
    5271                 : 
    5272 CBC          80 :     ItemPointerCopy(tid, &tupid);
    5273 ECB             : 
    5274                 :     for (;;)
    5275                 :     {
    5276 GIC          83 :         new_infomask = 0;
    5277              83 :         new_xmax = InvalidTransactionId;
    5278              83 :         block = ItemPointerGetBlockNumber(&tupid);
    5279              83 :         ItemPointerCopy(&tupid, &(mytup.t_self));
    5280                 : 
    5281              83 :         if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
    5282                 :         {
    5283                 :             /*
    5284                 :              * if we fail to find the updated version of the tuple, it's
    5285                 :              * because it was vacuumed/pruned away after its creator
    5286                 :              * transaction aborted.  So behave as if we got to the end of the
    5287                 :              * chain, and there's no further tuple to lock: return success to
    5288 ECB             :              * caller.
    5289                 :              */
    5290 UBC           0 :             result = TM_Ok;
    5291               0 :             goto out_unlocked;
    5292                 :         }
    5293                 : 
    5294 CBC          83 : l4:
    5295 GIC          91 :         CHECK_FOR_INTERRUPTS();
    5296 EUB             : 
    5297                 :         /*
    5298                 :          * Before locking the buffer, pin the visibility map page if it
    5299                 :          * appears to be necessary.  Since we haven't got the lock yet,
    5300                 :          * someone else might be in the middle of changing this, so we'll need
    5301                 :          * to recheck after we have the lock.
    5302                 :          */
    5303 CBC          91 :         if (PageIsAllVisible(BufferGetPage(buf)))
    5304                 :         {
    5305 UBC           0 :             visibilitymap_pin(rel, block, &vmbuffer);
    5306               0 :             pinned_desired_page = true;
    5307                 :         }
    5308                 :         else
    5309 CBC          91 :             pinned_desired_page = false;
    5310 ECB             : 
    5311 GIC          91 :         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    5312                 : 
    5313                 :         /*
    5314                 :          * If we didn't pin the visibility map page and the page has become
    5315                 :          * all visible while we were busy locking the buffer, we'll have to
    5316                 :          * unlock and re-lock, to avoid holding the buffer lock across I/O.
    5317                 :          * That's a bit unfortunate, but hopefully shouldn't happen often.
    5318                 :          *
    5319                 :          * Note: in some paths through this function, we will reach here
    5320 ECB             :          * holding a pin on a vm page that may or may not be the one matching
    5321                 :          * this page.  If this page isn't all-visible, we won't use the vm
    5322                 :          * page, but we hold onto such a pin till the end of the function.
    5323                 :          */
    5324 GBC          91 :         if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
    5325 EUB             :         {
    5326 UBC           0 :             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    5327 UIC           0 :             visibilitymap_pin(rel, block, &vmbuffer);
    5328 UBC           0 :             LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    5329 EUB             :         }
    5330                 : 
    5331                 :         /*
    5332                 :          * Check the tuple XMIN against prior XMAX, if any.  If we reached the
    5333                 :          * end of the chain, we're done, so return success.
    5334                 :          */
    5335 GIC          94 :         if (TransactionIdIsValid(priorXmax) &&
    5336               3 :             !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
    5337                 :                                  priorXmax))
    5338                 :         {
    5339 UIC           0 :             result = TM_Ok;
    5340 UBC           0 :             goto out_locked;
    5341                 :         }
    5342                 : 
    5343                 :         /*
    5344                 :          * Also check Xmin: if this tuple was created by an aborted
    5345                 :          * (sub)transaction, then we already locked the last live one in the
    5346 ECB             :          * chain, thus we're done, so return success.
    5347                 :          */
    5348 GIC          91 :         if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
    5349 ECB             :         {
    5350 GIC          13 :             result = TM_Ok;
    5351              13 :             goto out_locked;
    5352 ECB             :         }
    5353                 : 
    5354 GIC          78 :         old_infomask = mytup.t_data->t_infomask;
    5355              78 :         old_infomask2 = mytup.t_data->t_infomask2;
    5356              78 :         xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
    5357                 : 
    5358                 :         /*
    5359                 :          * If this tuple version has been updated or locked by some concurrent
    5360                 :          * transaction(s), what we do depends on whether our lock mode
    5361                 :          * conflicts with what those other transactions hold, and also on the
    5362                 :          * status of them.
    5363                 :          */
    5364 CBC          78 :         if (!(old_infomask & HEAP_XMAX_INVALID))
    5365 EUB             :         {
    5366                 :             TransactionId rawxmax;
    5367 ECB             :             bool        needwait;
    5368                 : 
    5369 CBC          30 :             rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
    5370              30 :             if (old_infomask & HEAP_XMAX_IS_MULTI)
    5371                 :             {
    5372 ECB             :                 int         nmembers;
    5373                 :                 int         i;
    5374                 :                 MultiXactMember *members;
    5375                 : 
    5376                 :                 /*
    5377                 :                  * We don't need a test for pg_upgrade'd tuples: this is only
    5378                 :                  * applied to tuples after the first in an update chain.  Said
    5379                 :                  * first tuple in the chain may well be locked-in-9.2-and-
    5380                 :                  * pg_upgraded, but that one was already locked by our caller,
    5381                 :                  * not us; and any subsequent ones cannot be because our
    5382                 :                  * caller must necessarily have obtained a snapshot later than
    5383                 :                  * the pg_upgrade itself.
    5384                 :                  */
    5385 GIC           1 :                 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
    5386 ECB             : 
    5387 GBC           1 :                 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
    5388 GIC           1 :                                                  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
    5389 GBC           4 :                 for (i = 0; i < nmembers; i++)
    5390                 :                 {
    5391 CBC           3 :                     result = test_lockmode_for_conflict(members[i].status,
    5392 GIC           3 :                                                         members[i].xid,
    5393                 :                                                         mode,
    5394 ECB             :                                                         &mytup,
    5395                 :                                                         &needwait);
    5396                 : 
    5397                 :                     /*
    5398                 :                      * If the tuple was already locked by ourselves in a
    5399                 :                      * previous iteration of this (say heap_lock_tuple was
    5400                 :                      * previous iteration of this loop (say heap_lock_tuple was
    5401                 :                      * in xmax), then we hold the lock already on this tuple
    5402                 :                      * version and we don't need to do anything; and this is
    5403                 :                      * not an error condition either.  We just need to skip
    5404                 :                      * this tuple and continue locking the next version in the
    5405                 :                      * update chain.
    5406                 :                      */
    5407 CBC           3 :                     if (result == TM_SelfModified)
    5408                 :                     {
    5409 LBC           0 :                         pfree(members);
    5410               0 :                         goto next;
    5411                 :                     }
    5412 ECB             : 
    5413 CBC           3 :                     if (needwait)
    5414 ECB             :                     {
    5415 LBC           0 :                         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    5416               0 :                         XactLockTableWait(members[i].xid, rel,
    5417                 :                                           &mytup.t_self,
    5418 ECB             :                                           XLTW_LockUpdated);
    5419 UIC           0 :                         pfree(members);
    5420 LBC           0 :                         goto l4;
    5421                 :                     }
    5422 CBC           3 :                     if (result != TM_Ok)
    5423                 :                     {
    5424 UIC           0 :                         pfree(members);
    5425 LBC           0 :                         goto out_locked;
    5426                 :                     }
    5427 ECB             :                 }
    5428 GIC           1 :                 if (members)
    5429 CBC           1 :                     pfree(members);
    5430 ECB             :             }
    5431                 :             else
    5432                 :             {
    5433                 :                 MultiXactStatus status;
    5434                 : 
    5435                 :                 /*
    5436                 :                  * For a non-multi Xmax, we first need to compute the
    5437                 :                  * corresponding MultiXactStatus by using the infomask bits.
    5438                 :                  */
    5439 CBC          29 :                 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
    5440 ECB             :                 {
    5441 CBC          10 :                     if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
    5442 GIC          10 :                         status = MultiXactStatusForKeyShare;
    5443 UIC           0 :                     else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
    5444               0 :                         status = MultiXactStatusForShare;
    5445               0 :                     else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
    5446 ECB             :                     {
    5447 LBC           0 :                         if (old_infomask2 & HEAP_KEYS_UPDATED)
    5448 UIC           0 :                             status = MultiXactStatusForUpdate;
    5449 ECB             :                         else
    5450 LBC           0 :                             status = MultiXactStatusForNoKeyUpdate;
    5451 EUB             :                     }
    5452                 :                     else
    5453 ECB             :                     {
    5454                 :                         /*
    5455                 :                          * LOCK_ONLY present alone (a pg_upgraded tuple marked
    5456                 :                          * as share-locked in the old cluster) shouldn't be
    5457                 :                          * seen in the middle of an update chain.
    5458                 :                          */
    5459 UIC           0 :                         elog(ERROR, "invalid lock status in tuple");
    5460                 :                     }
    5461                 :                 }
    5462                 :                 else
    5463                 :                 {
    5464                 :                     /* it's an update, but which kind? */
    5465 GIC          19 :                     if (old_infomask2 & HEAP_KEYS_UPDATED)
    5466              14 :                         status = MultiXactStatusUpdate;
    5467                 :                     else
    5468               5 :                         status = MultiXactStatusNoKeyUpdate;
    5469                 :                 }
    5470                 : 
    5471              29 :                 result = test_lockmode_for_conflict(status, rawxmax, mode,
    5472                 :                                                     &mytup, &needwait);
    5473                 : 
    5474                 :                 /*
    5475                 :                  * If the tuple was already locked by ourselves in a previous
    5476                 :                  * iteration of this loop (say heap_lock_tuple was forced to
    5477                 :                  * restart the locking loop because of a change in xmax), then
    5478                 :                  * we hold the lock already on this tuple version and we don't
    5479 ECB             :                  * need to do anything; and this is not an error condition
    5480                 :                  * either.  We just need to skip this tuple and continue
    5481                 :                  * locking the next version in the update chain.
    5482                 :                  */
    5483 GIC          29 :                 if (result == TM_SelfModified)
    5484 UIC           0 :                     goto next;
    5485                 : 
    5486 CBC          29 :                 if (needwait)
    5487 ECB             :                 {
    5488 GIC           8 :                     LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    5489               8 :                     XactLockTableWait(rawxmax, rel, &mytup.t_self,
    5490                 :                                       XLTW_LockUpdated);
    5491               8 :                     goto l4;
    5492                 :                 }
    5493              21 :                 if (result != TM_Ok)
    5494                 :                 {
    5495               8 :                     goto out_locked;
    5496                 :                 }
    5497                 :             }
    5498 ECB             :         }
    5499                 : 
    5500                 :         /* compute the new Xmax and infomask values for the tuple ... */
    5501 GIC          62 :         compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
    5502                 :                                   xid, mode, false,
    5503                 :                                   &new_xmax, &new_infomask, &new_infomask2);
    5504 ECB             : 
    5505 GIC          62 :         if (PageIsAllVisible(BufferGetPage(buf)) &&
    5506 UIC           0 :             visibilitymap_clear(rel, block, vmbuffer,
    5507                 :                                 VISIBILITYMAP_ALL_FROZEN))
    5508               0 :             cleared_all_frozen = true;
    5509                 : 
    5510 GIC          62 :         START_CRIT_SECTION();
    5511                 : 
    5512                 :         /* ... and set them */
    5513              62 :         HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
    5514              62 :         mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
    5515              62 :         mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    5516              62 :         mytup.t_data->t_infomask |= new_infomask;
    5517              62 :         mytup.t_data->t_infomask2 |= new_infomask2;
    5518                 : 
    5519              62 :         MarkBufferDirty(buf);
    5520                 : 
    5521                 :         /* XLOG stuff */
    5522              62 :         if (RelationNeedsWAL(rel))
    5523                 :         {
    5524 ECB             :             xl_heap_lock_updated xlrec;
    5525                 :             XLogRecPtr  recptr;
    5526 GIC          62 :             Page        page = BufferGetPage(buf);
    5527                 : 
    5528              62 :             XLogBeginInsert();
    5529 CBC          62 :             XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    5530                 : 
    5531 GIC          62 :             xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
    5532 CBC          62 :             xlrec.xmax = new_xmax;
    5533              62 :             xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
    5534              62 :             xlrec.flags =
    5535 GIC          62 :                 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
    5536 ECB             : 
    5537 CBC          62 :             XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
    5538 ECB             : 
    5539 GIC          62 :             recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
    5540 ECB             : 
    5541 GBC          62 :             PageSetLSN(page, recptr);
    5542                 :         }
    5543 ECB             : 
    5544 GIC          62 :         END_CRIT_SECTION();
    5545                 : 
    5546 CBC          62 : next:
    5547                 :         /* if we find the end of the update chain, we're done. */
    5548             124 :         if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
    5549 GIC         124 :             HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
    5550 CBC          66 :             ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
    5551 GIC           4 :             HeapTupleHeaderIsOnlyLocked(mytup.t_data))
    5552                 :         {
    5553              59 :             result = TM_Ok;
    5554              59 :             goto out_locked;
    5555                 :         }
    5556 ECB             : 
    5557                 :         /* tail recursion */
    5558 GIC           3 :         priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
    5559 CBC           3 :         ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
    5560 GIC           3 :         UnlockReleaseBuffer(buf);
    5561                 :     }
    5562                 : 
    5563                 :     result = TM_Ok;
    5564 ECB             : 
    5565 GIC          80 : out_locked:
    5566 CBC          80 :     UnlockReleaseBuffer(buf);
    5567                 : 
    5568 GIC          80 : out_unlocked:
    5569 CBC          80 :     if (vmbuffer != InvalidBuffer)
    5570 UIC           0 :         ReleaseBuffer(vmbuffer);
    5571 ECB             : 
    5572 CBC          80 :     return result;
    5573                 : }
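To make the control flow above easier to follow, here is a compact standalone model
of the chain walk (assumed simplifications: no buffer locks, no multixacts; plain
integers stand in for TransactionId, and a pointer stands in for the t_ctid link):

    #include <stdio.h>

    typedef unsigned int XidSketch;

    typedef struct VersionSketch
    {
        XidSketch xmin;                 /* creating transaction */
        XidSketch xmax;                 /* updating transaction, 0 = none */
        struct VersionSketch *next;     /* what t_ctid would point to */
    } VersionSketch;

    static void
    walk_update_chain_sketch(VersionSketch *v)
    {
        XidSketch prior_xmax = 0;

        while (v != NULL)
        {
            /* xmin must match the prior xmax, else the chain was broken */
            if (prior_xmax != 0 && v->xmin != prior_xmax)
                break;                  /* treat as end of chain: success */

            printf("lock version created by xid %u\n", v->xmin);

            if (v->xmax == 0)
                break;                  /* no further update: done */

            prior_xmax = v->xmax;
            v = v->next;                /* the "tail recursion" step */
        }
    }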
    5574 ECB             : 
    5575                 : /*
    5576                 :  * heap_lock_updated_tuple
    5577                 :  *      Follow update chain when locking an updated tuple, acquiring locks (row
    5578                 :  *      marks) on the updated versions.
    5579                 :  *
    5580                 :  * The initial tuple is assumed to be already locked.
    5581                 :  *
    5582                 :  * This function doesn't check visibility, it just unconditionally marks the
    5583                 :  * tuple(s) as locked.  If any tuple in the updated chain is being deleted
    5584                 :  * tuple(s) as locked.  If any tuple in the update chain is being deleted
    5585                 :  * transaction doing it is finished.
    5586                 :  *
    5587                 :  * Note that we don't acquire heavyweight tuple locks on the tuples we walk
    5588                 :  * when we have to wait for other transactions to release them, as opposed to
    5589                 :  * what heap_lock_tuple does.  The reason is that having more than one
    5590                 :  * transaction walking the chain is probably uncommon enough that the risk of
    5591                 :  * starvation is not likely: one of the preconditions for being here is that
    5592                 :  * the snapshot in use predates the update that created this tuple (because we
    5593                 :  * started at an earlier version of the tuple), but at the same time such a
    5594                 :  * transaction cannot be using repeatable read or serializable isolation
    5595                 :  * levels, because that would lead to a serializability failure.
    5596                 :  */
    5597                 : static TM_Result
    5598 GIC          88 : heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
    5599                 :                         TransactionId xid, LockTupleMode mode)
    5600                 : {
    5601                 :     /*
    5602                 :      * If the tuple has not been updated, or has moved into another partition
    5603                 :      * (effectively a delete) stop here.
    5604                 :      */
    5605              88 :     if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
    5606              86 :         !ItemPointerEquals(&tuple->t_self, ctid))
    5607                 :     {
    5608                 :         /*
    5609                 :          * If this is the first possibly-multixact-able operation in the
    5610                 :          * current transaction, set my per-backend OldestMemberMXactId
    5611 ECB             :          * setting. We can be certain that the transaction will never become a
    5612                 :          * member of any older MultiXactIds than that.  (We have to do this
    5613                 :          * even if we end up just using our own TransactionId below, since
    5614                 :          * some other backend could incorporate our XID into a MultiXact
    5615                 :          * immediately afterwards.)
    5616                 :          */
    5617 GIC          80 :         MultiXactIdSetOldestMember();
    5618                 : 
    5619              80 :         return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
    5620                 :     }
    5621 ECB             : 
    5622                 :     /* nothing to lock */
    5623 CBC           8 :     return TM_Ok;
    5624 ECB             : }
    5625                 : 
    5626                 : /*
    5627                 :  *  heap_finish_speculative - mark speculative insertion as successful
    5628                 :  *
    5629                 :  * To successfully finish a speculative insertion we have to clear speculative
    5630                 :  * To successfully finish a speculative insertion we have to clear the
    5631                 :  * speculative token from the tuple.  To do so, the t_ctid field, which
    5632                 :  * holds the speculative token, is modified in place to point to the tuple itself,
    5633                 :  *
    5634                 :  * NB: It is not ok to commit without either finishing or aborting a
    5635                 :  * speculative insertion.  We could treat speculative tuples of committed
    5636                 :  * transactions implicitly as completed, but then we would have to be prepared
    5637                 :  * to deal with speculative tokens on committed tuples.  That wouldn't be
    5638                 :  * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
    5639                 :  * but clearing the token at completion isn't very expensive either.
    5640                 :  * An explicit confirmation WAL record also makes logical decoding simpler.
    5641                 :  */
    5642                 : void
    5643 GIC        2005 : heap_finish_speculative(Relation relation, ItemPointer tid)
    5644                 : {
    5645                 :     Buffer      buffer;
    5646                 :     Page        page;
    5647 ECB             :     OffsetNumber offnum;
    5648 GBC        2005 :     ItemId      lp = NULL;
    5649 ECB             :     HeapTupleHeader htup;
    5650 EUB             : 
    5651 CBC        2005 :     buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
    5652 GIC        2005 :     LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    5653            2005 :     page = (Page) BufferGetPage(buffer);
    5654                 : 
    5655            2005 :     offnum = ItemPointerGetOffsetNumber(tid);
    5656            2005 :     if (PageGetMaxOffsetNumber(page) >= offnum)
    5657            2005 :         lp = PageGetItemId(page, offnum);
    5658                 : 
    5659 CBC        2005 :     if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
    5660 UIC           0 :         elog(ERROR, "invalid lp");
    5661                 : 
    5662 GIC        2005 :     htup = (HeapTupleHeader) PageGetItem(page, lp);
    5663                 : 
    5664                 :     /* NO EREPORT(ERROR) from here till changes are logged */
    5665            2005 :     START_CRIT_SECTION();
    5666                 : 
    5667 CBC        2005 :     Assert(HeapTupleHeaderIsSpeculative(htup));
    5668 ECB             : 
    5669 GBC        2005 :     MarkBufferDirty(buffer);
    5670                 : 
    5671 ECB             :     /*
    5672                 :      * Replace the speculative insertion token with a real t_ctid, pointing to
    5673                 :      * itself like it does on regular tuples.
    5674                 :      */
    5675 CBC        2005 :     htup->t_ctid = *tid;
    5676 ECB             : 
    5677                 :     /* XLOG stuff */
    5678 GIC        2005 :     if (RelationNeedsWAL(relation))
    5679                 :     {
    5680                 :         xl_heap_confirm xlrec;
    5681                 :         XLogRecPtr  recptr;
    5682                 : 
    5683 CBC        1999 :         xlrec.offnum = ItemPointerGetOffsetNumber(tid);
    5684                 : 
    5685 GIC        1999 :         XLogBeginInsert();
    5686 ECB             : 
    5687                 :         /* We want the same filtering on this as on a plain insert */
    5688 CBC        1999 :         XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
    5689                 : 
    5690 GIC        1999 :         XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
    5691            1999 :         XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
    5692                 : 
    5693            1999 :         recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
    5694                 : 
    5695            1999 :         PageSetLSN(page, recptr);
    5696 ECB             :     }
    5697                 : 
    5698 GIC        2005 :     END_CRIT_SECTION();
    5699                 : 
    5700            2005 :     UnlockReleaseBuffer(buffer);
    5701 CBC        2005 : }
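The speculative-token protocol this function completes can be summarized with a
small self-contained sketch (illustrative types only; the real token is encoded
inside t_ctid rather than stored alongside it):

    #include <assert.h>
    #include <stdbool.h>

    typedef struct TidSketch2 { unsigned blk, off; } TidSketch2;

    typedef struct SpecTupleSketch
    {
        TidSketch2 self;    /* where the tuple was inserted */
        TidSketch2 ctid;    /* token while in flight; == self once confirmed */
        bool       speculative;
    } SpecTupleSketch;

    static void
    finish_speculative_sketch(SpecTupleSketch *tup)
    {
        assert(tup->speculative);
        tup->ctid = tup->self;      /* replace token with a self-pointer */
        tup->speculative = false;   /* now an ordinary inserted tuple */
    }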
    5702 ECB             : 
    5703                 : /*
    5704                 :  *  heap_abort_speculative - kill a speculatively inserted tuple
    5705                 :  *
    5706                 :  * Marks a tuple that was speculatively inserted in the same command as dead,
    5707                 :  * by setting its xmin as invalid.  That makes it immediately appear as dead
    5708                 :  * by setting its xmin to invalid.  That makes it immediately appear as dead
    5709                 :  * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
    5710                 :  * inserting a duplicate key value won't unnecessarily wait for our whole
    5711                 :  * transaction to finish (it'll just wait for our speculative insertion to
    5712                 :  * finish).
    5713                 :  *
    5714                 :  * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
    5715                 :  * that arise due to a mutual dependency that is not user visible.  By
    5716                 :  * definition, unprincipled deadlocks cannot be prevented by the user
    5717                 :  * reordering lock acquisition in client code, because the implementation level
    5718                 :  * reordering lock acquisition in client code, because the implementation-level
    5719                 :  * inserters did not take this precaution, then under high concurrency they
    5720                 :  * could deadlock with each other, which would not be acceptable.
    5721                 :  *
    5722                 :  * This is somewhat redundant with heap_delete, but we prefer to have a
    5723                 :  * dedicated routine with stripped down requirements.  Note that this is also
    5724                 :  * used to delete the TOAST tuples created during speculative insertion.
    5725                 :  *
    5726                 :  * This routine does not affect logical decoding as it only looks at
    5727                 :  * confirmation records.
    5728                 :  */
    5729                 : void
    5730 GIC          10 : heap_abort_speculative(Relation relation, ItemPointer tid)
    5731                 : {
    5732              10 :     TransactionId xid = GetCurrentTransactionId();
    5733                 :     ItemId      lp;
    5734 ECB             :     HeapTupleData tp;
    5735                 :     Page        page;
    5736                 :     BlockNumber block;
    5737                 :     Buffer      buffer;
    5738                 :     TransactionId prune_xid;
    5739                 : 
    5740 GIC          10 :     Assert(ItemPointerIsValid(tid));
    5741                 : 
    5742              10 :     block = ItemPointerGetBlockNumber(tid);
    5743              10 :     buffer = ReadBuffer(relation, block);
    5744              10 :     page = BufferGetPage(buffer);
    5745                 : 
    5746              10 :     LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    5747                 : 
    5748                 :     /*
    5749                 :      * Page can't be all visible, since we just inserted into it and are
    5750                 :      * still running.
    5751                 :      */
    5752              10 :     Assert(!PageIsAllVisible(page));
    5753                 : 
    5754              10 :     lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
    5755              10 :     Assert(ItemIdIsNormal(lp));
    5756                 : 
    5757              10 :     tp.t_tableOid = RelationGetRelid(relation);
    5758              10 :     tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
    5759              10 :     tp.t_len = ItemIdGetLength(lp);
    5760 CBC          10 :     tp.t_self = *tid;
    5761                 : 
    5762                 :     /*
    5763                 :      * Sanity check that the tuple really is a speculatively inserted tuple,
    5764                 :      * inserted by us.
    5765 ECB             :      */
    5766 GIC          10 :     if (tp.t_data->t_choice.t_heap.t_xmin != xid)
    5767 UIC           0 :         elog(ERROR, "attempted to kill a tuple inserted by another transaction");
    5768 GIC          10 :     if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
    5769 UIC           0 :         elog(ERROR, "attempted to kill a non-speculative tuple");
    5770 GIC          10 :     Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
    5771                 : 
    5772                 :     /*
    5773                 :      * No need to check for serializable conflicts here.  There is never a
    5774                 :      * need for a combo CID, either.  No need to extract replica identity, or
    5775                 :      * do anything special with infomask bits.
    5776 ECB             :      */
    5777 EUB             : 
    5778 GIC          10 :     START_CRIT_SECTION();
    5779                 : 
    5780                 :     /*
    5781 ECB             :      * The tuple will become DEAD immediately.  Flag that this page is a
    5782                 :      * candidate for pruning by setting xmin to TransactionXmin. While not
    5783                 :      * immediately prunable, it is the oldest xid we can cheaply determine
    5784                 :      * that's safe against wraparound / being older than the table's
    5785                 :      * relfrozenxid.  To defend against the unlikely case of a new relation
    5786                 :      * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
    5787                 :      * if so (vacuum can't subsequently move relfrozenxid to beyond
    5788                 :      * TransactionXmin, so there's no race here).
    5789                 :      */
    5790 GBC          10 :     Assert(TransactionIdIsValid(TransactionXmin));
    5791 GIC          10 :     if (TransactionIdPrecedes(TransactionXmin, relation->rd_rel->relfrozenxid))
    5792 LBC           0 :         prune_xid = relation->rd_rel->relfrozenxid;
    5793                 :     else
    5794 CBC          10 :         prune_xid = TransactionXmin;
    5795              10 :     PageSetPrunable(page, prune_xid);
    5796 ECB             : 
    5797 EUB             :     /* store transaction information of xact deleting the tuple */
    5798 GIC          10 :     tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
    5799              10 :     tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    5800 ECB             : 
    5801                 :     /*
    5802                 :      * Set the tuple header xmin to InvalidTransactionId.  This makes the
    5803                 :      * tuple immediately invisible to everyone.  (In particular, to any
    5804                 :      * transactions waiting on the speculative token, woken up later.)
    5805                 :      */
    5806 CBC          10 :     HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
    5807                 : 
    5808                 :     /* Clear the speculative insertion token too */
    5809              10 :     tp.t_data->t_ctid = tp.t_self;
    5810                 : 
    5811 GIC          10 :     MarkBufferDirty(buffer);
    5812                 : 
    5813                 :     /*
    5814 ECB             :      * XLOG stuff
    5815                 :      *
    5816                 :      * The WAL records generated here match heap_delete().  The same recovery
    5817                 :      * routines are used.
    5818                 :      */
    5819 CBC          10 :     if (RelationNeedsWAL(relation))
    5820 ECB             :     {
    5821                 :         xl_heap_delete xlrec;
    5822                 :         XLogRecPtr  recptr;
    5823                 : 
    5824 CBC          10 :         xlrec.flags = XLH_DELETE_IS_SUPER;
    5825 GIC          20 :         xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
    5826 CBC          10 :                                               tp.t_data->t_infomask2);
    5827 GIC          10 :         xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
    5828              10 :         xlrec.xmax = xid;
    5829 ECB             : 
    5830 GIC          10 :         XLogBeginInsert();
    5831 CBC          10 :         XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
    5832 GIC          10 :         XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
    5833                 : 
    5834                 :         /* No replica identity & replication origin logged */
    5835                 : 
    5836              10 :         recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
    5837                 : 
    5838              10 :         PageSetLSN(page, recptr);
    5839 ECB             :     }
    5840                 : 
    5841 CBC          10 :     END_CRIT_SECTION();
    5842                 : 
    5843 GIC          10 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    5844                 : 
    5845              10 :     if (HeapTupleHasExternal(&tp))
    5846                 :     {
    5847               1 :         Assert(!IsToastRelation(relation));
    5848               1 :         heap_toast_delete(relation, &tp, true);
    5849                 :     }
    5850                 : 
    5851                 :     /*
    5852                 :      * Never need to mark tuple for invalidation, since catalogs don't support
    5853                 :      * speculative insertion
    5854                 :      */
    5855                 : 
    5856                 :     /* Now we can release the buffer */
    5857              10 :     ReleaseBuffer(buffer);
    5858                 : 
    5859                 :     /* count deletion, as we counted the insertion too */
    5860              10 :     pgstat_count_heap_delete(relation);
    5861              10 : }
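The key trick of the abort path is that invalidating xmin kills the tuple for every
snapshot at once, including our own, so waiters on the speculative token can move
on immediately.  A toy model of that property (hypothetical names; 0 stands in for
InvalidTransactionId):

    #include <stdbool.h>

    #define INVALID_XID_SKETCH 0

    typedef struct KilledTupleSketch
    {
        unsigned xmin;      /* creating transaction; 0 once killed */
    } KilledTupleSketch;

    /* Every visibility test fails once xmin is invalid. */
    static bool
    could_be_visible_sketch(const KilledTupleSketch *tup)
    {
        return tup->xmin != INVALID_XID_SKETCH;
    }

    static void
    abort_speculative_sketch(KilledTupleSketch *tup)
    {
        tup->xmin = INVALID_XID_SKETCH;     /* dead even to our own xact */
    }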
    5862                 : 
    5863                 : /*
    5864                 :  * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
    5865                 :  *
    5866                 :  * Overwriting violates both MVCC and transactional safety, so the uses
    5867                 :  * of this function in Postgres are extremely limited.  Nonetheless we
    5868                 :  * find some places to use it.
    5869                 :  *
    5870                 :  * The tuple cannot change size, and therefore it's reasonable to assume
    5871                 :  * that its null bitmap (if any) doesn't change either.  So we just
    5872                 :  * overwrite the data portion of the tuple without touching the null
    5873                 :  * bitmap or any of the header fields.
    5874                 :  *
    5875                 :  * tuple is an in-memory tuple structure containing the data to be written
    5876                 :  * over the target tuple.  Also, tuple->t_self identifies the target tuple.
    5877                 :  *
    5878                 :  * Note that the tuple updated here had better not come directly from the
    5879                 :  * syscache if the relation has a toast relation, as this tuple could
    5880                 :  * include toast values that have been expanded, causing a failure here.
    5881                 :  */
    5882                 : void
    5883          184243 : heap_inplace_update(Relation relation, HeapTuple tuple)
    5884                 : {
    5885                 :     Buffer      buffer;
    5886                 :     Page        page;
    5887                 :     OffsetNumber offnum;
    5888          184243 :     ItemId      lp = NULL;
    5889                 :     HeapTupleHeader htup;
    5890                 :     uint32      oldlen;
    5891                 :     uint32      newlen;
    5892                 : 
    5893                 :     /*
    5894                 :      * For now, we don't allow parallel updates.  Unlike a regular update,
    5895                 :      * this should never create a combo CID, so it might be possible to relax
    5896 ECB             :      * this restriction, but not without more thought and testing.  It's not
    5897                 :      * clear that it would be useful, anyway.
    5898                 :      */
    5899 GIC      184243 :     if (IsInParallelMode())
    5900 UIC           0 :         ereport(ERROR,
    5901                 :                 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
    5902                 :                  errmsg("cannot update tuples during a parallel operation")));
    5903                 : 
    5904 GIC      184243 :     buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&(tuple->t_self)));
    5905          184243 :     LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
    5906          184243 :     page = (Page) BufferGetPage(buffer);
    5907                 : 
    5908          184243 :     offnum = ItemPointerGetOffsetNumber(&(tuple->t_self));
    5909          184243 :     if (PageGetMaxOffsetNumber(page) >= offnum)
    5910          184243 :         lp = PageGetItemId(page, offnum);
    5911 ECB             : 
    5912 GIC      184243 :     if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
    5913 UIC           0 :         elog(ERROR, "invalid lp");
    5914 ECB             : 
    5915 GIC      184243 :     htup = (HeapTupleHeader) PageGetItem(page, lp);
    5916 ECB             : 
    5917 CBC      184243 :     oldlen = ItemIdGetLength(lp) - htup->t_hoff;
    5918 GIC      184243 :     newlen = tuple->t_len - tuple->t_data->t_hoff;
    5919 GBC      184243 :     if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
    5920 UBC           0 :         elog(ERROR, "wrong tuple length");
    5921 EUB             : 
    5922                 :     /* NO EREPORT(ERROR) from here till changes are logged */
    5923 CBC      184243 :     START_CRIT_SECTION();
    5924 EUB             : 
    5925 GIC      184243 :     memcpy((char *) htup + htup->t_hoff,
    5926          184243 :            (char *) tuple->t_data + tuple->t_data->t_hoff,
    5927                 :            newlen);
    5928 ECB             : 
    5929 GIC      184243 :     MarkBufferDirty(buffer);
    5930                 : 
    5931                 :     /* XLOG stuff */
    5932          184243 :     if (RelationNeedsWAL(relation))
    5933                 :     {
    5934                 :         xl_heap_inplace xlrec;
    5935                 :         XLogRecPtr  recptr;
    5936                 : 
    5937          184235 :         xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
    5938 ECB             : 
    5939 CBC      184235 :         XLogBeginInsert();
    5940 GBC      184235 :         XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
    5941                 : 
    5942 GIC      184235 :         XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
    5943          184235 :         XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
    5944                 : 
    5945 ECB             :         /* inplace updates aren't decoded atm, don't log the origin */
    5946                 : 
    5947 CBC      184235 :         recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
    5948 ECB             : 
    5949 CBC      184235 :         PageSetLSN(page, recptr);
    5950                 :     }
    5951                 : 
    5952 GIC      184243 :     END_CRIT_SECTION();
    5953 EUB             : 
    5954 GBC      184243 :     UnlockReleaseBuffer(buffer);
    5955 EUB             : 
    5956                 :     /*
    5957                 :      * Send out shared cache inval if necessary.  Note that because we only
    5958                 :      * pass the new version of the tuple, this mustn't be used for any
    5959                 :      * operations that could change catcache lookup keys.  But we aren't
    5960                 :      * bothering with index updates either, so that's true a fortiori.
    5961                 :      */
    5962 GIC      184243 :     if (!IsBootstrapProcessingMode())
    5963           93963 :         CacheInvalidateHeapTuple(relation, tuple, NULL);
    5964          184243 : }
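Because only the data portion is overwritten and nothing on the page moves, an
in-place update must preserve the tuple's length exactly (the "wrong tuple length"
error above).  A self-contained sketch of that constraint, with invented names:

    #include <string.h>

    typedef struct InplaceSketch
    {
        size_t len;
        char  *data;
    } InplaceSketch;

    /* Returns 0 on success, -1 when the sizes differ. */
    static int
    inplace_update_sketch(InplaceSketch *target,
                          const char *newdata, size_t newlen)
    {
        if (newlen != target->len)
            return -1;                  /* cannot change size in place */
        memcpy(target->data, newdata, newlen);
        return 0;
    }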
    5965                 : 
    5966                 : #define     FRM_NOOP                0x0001
    5967 EUB             : #define     FRM_INVALIDATE_XMAX     0x0002
    5968                 : #define     FRM_RETURN_IS_XID       0x0004
    5969                 : #define     FRM_RETURN_IS_MULTI     0x0008
    5970                 : #define     FRM_MARK_COMMITTED      0x0010
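The FRM_* values are independent bits combined into the "flags" output of
FreezeMultiXactId below; FRM_MARK_COMMITTED can accompany FRM_RETURN_IS_XID, while
the remaining bits are mutually exclusive actions.  A hedged sketch of how a caller
might decode them (the function name is hypothetical; the real consumer is the
tuple-freezing code elsewhere in this file):

    static const char *
    decode_frm_action_sketch(unsigned short flags)
    {
        if (flags & FRM_NOOP)
            return "keep the existing xmax unchanged";
        if (flags & FRM_INVALIDATE_XMAX)
            return "set xmax invalid (HEAP_XMAX_INVALID)";
        if (flags & FRM_RETURN_IS_XID)
            return "xmax becomes the returned updater XID";
        if (flags & FRM_RETURN_IS_MULTI)
            return "xmax becomes the newly created MultiXactId";
        return "no action bit set";
    }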
    5971                 : 
    5972                 : /*
    5973                 :  * FreezeMultiXactId
    5974                 :  *      Determine what to do during freezing when a tuple is marked by a
    5975                 :  *      MultiXactId.
    5976                 :  *
    5977                 :  * "flags" is an output value; it's used to tell caller what to do on return.
    5978                 :  * "pagefrz" is an input/output value, used to manage page level freezing.
    5979                 :  *
    5980                 :  * Possible values that we can set in "flags":
    5981                 :  * FRM_NOOP
    5982                 :  *      don't do anything -- keep existing Xmax
    5983                 :  * FRM_INVALIDATE_XMAX
    5984                 :  *      mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
    5985                 :  * FRM_RETURN_IS_XID
    5986                 :  *      The Xid return value is a single update Xid to set as xmax.
    5987                 :  * FRM_MARK_COMMITTED
    5988 ECB             :  *      Xmax can be marked as HEAP_XMAX_COMMITTED
    5989                 :  * FRM_RETURN_IS_MULTI
    5990                 :  *      The return value is a new MultiXactId to set as new Xmax.
    5991                 :  *      (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
    5992                 :  *
    5993                 :  * Caller delegates control of page freezing to us.  In practice we always
    5994                 :  * force freezing of the caller's page unless FRM_NOOP processing is indicated.
    5995                 :  * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
    5996                 :  * can never be left behind.  We freely choose when and how to process each
    5997                 :  * Multi, without ever violating the cutoff postconditions for freezing.
    5998 EUB             :  *
    5999                 :  * It's useful to remove Multis on a proactive timeline (relative to freezing
    6000                 :  * XIDs) to keep MultiXact member SLRU buffer misses to a minimum.  It can also
    6001                 :  * be cheaper in the short run, for us, since we too can avoid SLRU buffer
    6002                 :  * misses through eager processing.
    6003                 :  *
    6004                 :  * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
    6005                 :  * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
    6006                 :  * This can usually be put off, which is usually enough to avoid it altogether.
    6007                 :  * Allocating new multis during VACUUM should be avoided on general principle;
    6008                 :  * only VACUUM can advance relminmxid, so allocating new Multis here comes with
    6009                 :  * its own special risks.
    6010                 :  *
    6011                 :  * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers
    6012                 :  * using heap_tuple_should_freeze when we haven't forced page-level freezing.
    6013                 :  *
    6014                 :  * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
    6015                 :  * have already forced page-level freezing, since that might incur the same
    6016                 :  * SLRU buffer misses that we specifically intended to avoid by freezing.
    6017                 :  */
    6018                 : static TransactionId
    6019 GIC           6 : FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
    6020                 :                   const struct VacuumCutoffs *cutoffs, uint16 *flags,
    6021                 :                   HeapPageFreeze *pagefrz)
    6022                 : {
    6023                 :     TransactionId newxmax;
    6024                 :     MultiXactMember *members;
    6025                 :     int         nmembers;
    6026                 :     bool        need_replace;
    6027                 :     int         nnewmembers;
    6028                 :     MultiXactMember *newmembers;
    6029                 :     bool        has_lockers;
    6030                 :     TransactionId update_xid;
    6031                 :     bool        update_committed;
    6032                 :     TransactionId FreezePageRelfrozenXid;
    6033                 : 
    6034 CBC           6 :     *flags = 0;
    6035 ECB             : 
    6036                 :     /* We should only be called in Multis */
    6037 GIC           6 :     Assert(t_infomask & HEAP_XMAX_IS_MULTI);
    6038 ECB             : 
    6039 GIC           6 :     if (!MultiXactIdIsValid(multi) ||
    6040 CBC           6 :         HEAP_LOCKED_UPGRADED(t_infomask))
    6041                 :     {
    6042 UIC           0 :         *flags |= FRM_INVALIDATE_XMAX;
    6043 UNC           0 :         pagefrz->freeze_required = true;
    6044 UIC           0 :         return InvalidTransactionId;
    6045 ECB             :     }
    6046 GNC           6 :     else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
    6047 UIC           0 :         ereport(ERROR,
    6048 ECB             :                 (errcode(ERRCODE_DATA_CORRUPTED),
    6049 EUB             :                  errmsg_internal("found multixact %u from before relminmxid %u",
    6050                 :                                  multi, cutoffs->relminmxid)));
    6051 GNC           6 :     else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
    6052                 :     {
    6053                 :         TransactionId update_xact;
    6054                 : 
    6055 ECB             :         /*
    6056                 :          * This old multi cannot possibly have members still running, but
    6057                 :          * verify just in case.  If it was a locker only, it can be removed
    6058                 :          * without any further consideration; but if it contained an update,
    6059                 :          * we might need to preserve it.
    6060                 :          */
    6061 GIC           4 :         if (MultiXactIdIsRunning(multi,
    6062               4 :                                  HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
    6063 UIC           0 :             ereport(ERROR,
    6064                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
    6065                 :                      errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
    6066                 :                                      multi, cutoffs->OldestMxact)));
    6067 ECB             : 
    6068 GBC           4 :         if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
    6069 ECB             :         {
    6070 CBC           4 :             *flags |= FRM_INVALIDATE_XMAX;
    6071 GNC           4 :             pagefrz->freeze_required = true;
    6072               4 :             return InvalidTransactionId;
    6073                 :         }
    6074                 : 
    6075                 :         /* replace multi with single XID for its updater? */
    6076 UNC           0 :         update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
    6077               0 :         if (TransactionIdPrecedes(update_xact, cutoffs->relfrozenxid))
    6078               0 :             ereport(ERROR,
    6079                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
    6080                 :                      errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
    6081                 :                                      multi, update_xact,
    6082                 :                                      cutoffs->relfrozenxid)));
    6083               0 :         else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
    6084                 :         {
    6085                 :             /*
    6086                 :              * Updater XID has to have aborted (otherwise the tuple would have
    6087                 :              * been pruned away instead, since updater XID is < OldestXmin).
    6088                 :              * Just remove xmax.
    6089                 :              */
    6090               0 :             if (TransactionIdDidCommit(update_xact))
    6091 LBC           0 :                 ereport(ERROR,
    6092 ECB             :                         (errcode(ERRCODE_DATA_CORRUPTED),
    6093                 :                          errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
    6094                 :                                          multi, update_xact,
    6095                 :                                          cutoffs->OldestXmin)));
    6096 UNC           0 :             *flags |= FRM_INVALIDATE_XMAX;
    6097               0 :             pagefrz->freeze_required = true;
    6098               0 :             return InvalidTransactionId;
    6099 ECB             :         }
    6100                 : 
    6101                 :         /* Have to keep updater XID as new xmax */
    6102 UNC           0 :         *flags |= FRM_RETURN_IS_XID;
    6103               0 :         pagefrz->freeze_required = true;
    6104               0 :         return update_xact;
    6105                 :     }
    6106 ECB             : 
    6107                 :     /*
    6108                 :      * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
    6109                 :      * need to walk the whole members array to figure out what to do, if
    6110                 :      * anything.
    6111                 :      */
    6112                 :     nmembers =
    6113 GIC           2 :         GetMultiXactIdMembers(multi, &members, false,
    6114               2 :                               HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
    6115               2 :     if (nmembers <= 0)
    6116                 :     {
    6117                 :         /* Nothing worth keeping */
    6118 UIC           0 :         *flags |= FRM_INVALIDATE_XMAX;
    6119 UNC           0 :         pagefrz->freeze_required = true;
    6120 UIC           0 :         return InvalidTransactionId;
    6121                 :     }
    6122 EUB             : 
    6123                 :     /*
    6124                 :      * The FRM_NOOP case is the only case where we might need to ratchet back
    6125                 :      * FreezePageRelfrozenXid or FreezePageRelminMxid.  It is also the only
    6126                 :      * case where our caller might ratchet back its NoFreezePageRelfrozenXid
    6127                 :      * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
    6128                 :      * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
    6129                 :      * trackers managed by VACUUM being ratcheting back by xmax to the degree
    6130                 :      * trackers managed by VACUUM being ratcheted back by xmax to the degree
    6131                 :      * whether or not page freezing is triggered somewhere else.
    6132                 :      *
    6133                 :      * Our policy is to force freezing in every case other than FRM_NOOP,
    6134                 :      * which obviates the need to maintain either set of trackers, anywhere.
    6135                 :      * Every other case will reliably execute a freeze plan for xmax that
    6136                 :      * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
    6137                 :      * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
    6138                 :      * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
    6139                 :      * OldestXmin/OldestMxact, so later values never need to be tracked here.)
    6140                 :      */
    6141 GIC           2 :     need_replace = false;
    6142 GNC           2 :     FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
    6143               4 :     for (int i = 0; i < nmembers; i++)
    6144                 :     {
    6145               3 :         TransactionId xid = members[i].xid;
    6146                 : 
    6147               3 :         Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
    6148                 : 
    6149               3 :         if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
    6150                 :         {
    6151                 :             /* Can't violate the FreezeLimit postcondition */
    6152 GIC           1 :             need_replace = true;
    6153               1 :             break;
    6154                 :         }
    6155 GNC           2 :         if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
    6156 UNC           0 :             FreezePageRelfrozenXid = xid;
    6157 EUB             :     }
    6158                 : 
    6159                 :     /* Can't violate the MultiXactCutoff postcondition, either */
    6160 GNC           2 :     if (!need_replace)
    6161               1 :         need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff);
    6162                 : 
    6163 GIC           2 :     if (!need_replace)
    6164                 :     {
    6165                 :         /*
    6166                 :          * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
    6167                 :          * both together to make it safe to retain this particular multi after
    6168                 :          * freezing its page
    6169                 :          */
    6170               1 :         *flags |= FRM_NOOP;
    6171 GNC           1 :         pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
    6172               1 :         if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
    6173 UNC           0 :             pagefrz->FreezePageRelminMxid = multi;
    6174 GIC           1 :         pfree(members);
    6175               1 :         return multi;
    6176                 :     }
    6177                 : 
    6178 EUB             :     /*
    6179                 :      * Do a more thorough second pass over the multi to figure out which
    6180                 :      * member XIDs actually need to be kept.  Checking the precise status of
    6181                 :      * individual members might even show that we don't need to keep anything.
    6182                 :      * That is quite possible even though the Multi must be >= OldestMxact,
    6183                 :      * since our second pass only keeps member XIDs when it's truly necessary;
    6184                 :  * even member XIDs >= OldestXmin often won't be kept by the second pass.
    6185                 :      */
    6186 GIC           1 :     nnewmembers = 0;
    6187               1 :     newmembers = palloc(sizeof(MultiXactMember) * nmembers);
    6188 GBC           1 :     has_lockers = false;
    6189               1 :     update_xid = InvalidTransactionId;
    6190 GIC           1 :     update_committed = false;
    6191                 : 
    6192                 :     /*
    6193                 :      * Determine whether to keep each member xid, or to ignore it instead
    6194                 :      */
    6195 GNC           3 :     for (int i = 0; i < nmembers; i++)
    6196                 :     {
    6197               2 :         TransactionId xid = members[i].xid;
    6198               2 :         MultiXactStatus mstatus = members[i].status;
    6199                 : 
    6200               2 :         Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
    6201                 : 
    6202               2 :         if (!ISUPDATE_from_mxstatus(mstatus))
    6203                 :         {
    6204                 :             /*
    6205                 :              * Locker XID (not updater XID).  We only keep lockers that are
    6206                 :              * still running.
    6207 EUB             :              */
    6208 GBC           4 :             if (TransactionIdIsCurrentTransactionId(xid) ||
    6209 GIC           2 :                 TransactionIdIsInProgress(xid))
    6210                 :             {
    6211 GNC           1 :                 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
    6212 UNC           0 :                     ereport(ERROR,
    6213                 :                             (errcode(ERRCODE_DATA_CORRUPTED),
    6214                 :                              errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
    6215                 :                                              multi, xid,
    6216                 :                                              cutoffs->OldestXmin)));
    6217 CBC           1 :                 newmembers[nnewmembers++] = members[i];
    6218 GIC           1 :                 has_lockers = true;
    6219                 :             }
    6220                 : 
    6221 GNC           2 :             continue;
    6222 ECB             :         }
    6223                 : 
    6224                 :         /*
    6225                 :          * Updater XID (not locker XID).  Should we keep it?
    6226                 :          *
    6227                 :          * Since the tuple wasn't totally removed when vacuum pruned, the
    6228                 :          * update Xid cannot possibly be older than the OldestXmin cutoff unless
    6229                 :          * the updater XID aborted.  If the updater transaction is known
    6230                 :          * aborted or crashed then it's okay to ignore it, otherwise not.
    6231                 :          *
    6232                 :          * In any case the Multi should never contain two updaters, whatever
    6233                 :          * their individual commit status.  Check for that first, in passing.
    6234                 :          */
    6235 UNC           0 :         if (TransactionIdIsValid(update_xid))
    6236               0 :             ereport(ERROR,
    6237                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
    6238                 :                      errmsg_internal("multixact %u has two or more updating members",
    6239                 :                                      multi),
    6240                 :                      errdetail_internal("First updater XID=%u second updater XID=%u.",
    6241                 :                                         update_xid, xid)));
    6242                 : 
    6243                 :         /*
    6244                 :          * As with all tuple visibility routines, it's critical to test
    6245                 :          * TransactionIdIsInProgress before TransactionIdDidCommit, because of
    6246                 :          * race conditions explained in detail in heapam_visibility.c.
    6247                 :          */
    6248               0 :         if (TransactionIdIsCurrentTransactionId(xid) ||
    6249               0 :             TransactionIdIsInProgress(xid))
    6250               0 :             update_xid = xid;
    6251               0 :         else if (TransactionIdDidCommit(xid))
    6252                 :         {
    6253                 :             /*
    6254                 :              * The transaction committed, so we can tell caller to set
    6255                 :              * HEAP_XMAX_COMMITTED.  (We can only do this because we know the
    6256                 :              * transaction is not running.)
    6257                 :              */
    6258               0 :             update_committed = true;
    6259               0 :             update_xid = xid;
    6260                 :         }
    6261                 :         else
    6262                 :         {
    6263                 :             /*
    6264                 :              * Not in progress, not committed -- must be aborted or crashed;
    6265                 :              * we can ignore it.
    6266                 :              */
    6267               0 :             continue;
    6268                 :         }
    6269                 : 
    6270                 :         /*
    6271                 :          * We determined that updater must be kept -- add it to pending new
    6272                 :          * members list
    6273                 :          */
    6274               0 :         if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
    6275               0 :             ereport(ERROR,
    6276                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
    6277                 :                      errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
    6278                 :                                      multi, xid, cutoffs->OldestXmin)));
    6279               0 :         newmembers[nnewmembers++] = members[i];
    6280 ECB             :     }
    6281                 : 
    6282 CBC           1 :     pfree(members);
    6283 ECB             : 
    6284                 :     /*
    6285                 :      * Determine what to do with caller's multi based on information gathered
    6286                 :      * during our second pass
    6287                 :      */
    6288 GIC           1 :     if (nnewmembers == 0)
    6289                 :     {
    6290                 :         /* Nothing worth keeping */
    6291 LBC           0 :         *flags |= FRM_INVALIDATE_XMAX;
    6292 UNC           0 :         newxmax = InvalidTransactionId;
    6293                 :     }
    6294 CBC           1 :     else if (TransactionIdIsValid(update_xid) && !has_lockers)
    6295 EUB             :     {
    6296                 :         /*
    6297                 :          * If there's a single member and it's an update, pass it back alone
    6298                 :          * without creating a new Multi.  (XXX we could do this when there's a
    6299                 :          * single remaining locker, too, but that would complicate the API too
    6300                 :          * much; moreover, the case with the single updater is more
    6301 ECB             :          * interesting, because those are longer-lived.)
    6302                 :          */
    6303 UIC           0 :         Assert(nnewmembers == 1);
    6304 LBC           0 :         *flags |= FRM_RETURN_IS_XID;
    6305               0 :         if (update_committed)
    6306 UIC           0 :             *flags |= FRM_MARK_COMMITTED;
    6307 UNC           0 :         newxmax = update_xid;
    6308                 :     }
    6309                 :     else
    6310                 :     {
    6311 ECB             :         /*
    6312                 :          * Create a new multixact with the surviving members of the previous
    6313                 :          * one, to set as new Xmax in the tuple
    6314 EUB             :          */
    6315 GNC           1 :         newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
    6316 GIC           1 :         *flags |= FRM_RETURN_IS_MULTI;
    6317                 :     }
    6318                 : 
    6319 GBC           1 :     pfree(newmembers);
    6320                 : 
    6321 GNC           1 :     pagefrz->freeze_required = true;
    6322               1 :     return newxmax;
    6323                 : }
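/*
 * Editorial note (illustrative, not part of heapam.c): FreezeMultiXactId
 * has exactly four outcomes, all visible in the code above:
 *
 *   FRM_NOOP             xmax multi is kept as-is; only the "freeze page"
 *                        trackers may be ratcheted back to keep that safe
 *   FRM_INVALIDATE_XMAX  no member is worth keeping; xmax is invalidated
 *   FRM_RETURN_IS_XID    a lone surviving updater XID replaces the multi
 *                        (plus FRM_MARK_COMMITTED if it is known committed)
 *   FRM_RETURN_IS_MULTI  two or more members survive; a replacement multi
 *                        is built via MultiXactIdCreateFromMembers
 *
 * Every outcome except FRM_NOOP also sets pagefrz->freeze_required.
 */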
    6324                 : 
    6325                 : /*
    6326 ECB             :  * heap_prepare_freeze_tuple
    6327                 :  *
    6328                 :  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
    6329                 :  * are older than the OldestXmin and/or OldestMxact freeze cutoffs.  If so,
    6330                 :  * setup enough state (in the *frz output argument) to enable caller to
    6331                 :  * set up enough state (in the *frz output argument) to enable caller to
    6332                 :  * false if nothing can be changed about the tuple right now.
    6333                 :  *
    6334                 :  * Also sets *totally_frozen to true if the tuple will be totally frozen once
    6335                 :  * caller executes returned freeze plan (or if the tuple was already totally
    6336                 :  * frozen by an earlier VACUUM).  This indicates that there are no remaining
    6337                 :  * XIDs or MultiXactIds that will need to be processed by a future VACUUM.
    6338                 :  *
    6339                 :  * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
    6340                 :  * tuple that we returned true for, and call heap_freeze_execute_prepared to
    6341                 :  * execute freezing.  Caller must initialize pagefrz fields for page as a
    6342                 :  * whole before first call here for each heap page.
    6343                 :  *
    6344                 :  * VACUUM caller decides on whether or not to freeze the page as a whole.
    6345                 :  * We'll often prepare freeze plans for a page that caller just discards.
    6346                 :  * However, VACUUM doesn't always get to make a choice; it must freeze when
    6347                 :  * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and
    6348                 :  * MXIDs < MultiXactCutoff) can never be left behind.  We help to make sure
    6349                 :  * that VACUUM always follows that rule.
    6350                 :  *
    6351                 :  * We sometimes force freezing of xmax MultiXactId values long before it is
    6352                 :  * strictly necessary to do so just to ensure the FreezeLimit postcondition.
    6353                 :  * It's worth processing MultiXactIds proactively when it is cheap to do so,
    6354                 :  * and it's convenient to make that happen by piggy-backing it on the "force
    6355                 :  * freezing" mechanism.  Conversely, we sometimes delay freezing MultiXactIds
    6356                 :  * because it is expensive right now (though only when it's still possible to
    6357                 :  * do so without violating the FreezeLimit/MultiXactCutoff postcondition).
    6358                 :  *
    6359                 :  * It is assumed that the caller has checked the tuple with
    6360                 :  * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
    6361                 :  * (else we should be removing the tuple, not freezing it).
    6362                 :  *
    6363                 :  * NB: This function has side effects: it might allocate a new MultiXactId.
    6364                 :  * It will be set as tuple's new xmax when our *frz output is processed within
    6365                 :  * heap_execute_freeze_tuple later on.  If the tuple is in a shared buffer
    6366                 :  * then caller had better have an exclusive lock on it already.
    6367                 :  */
    6368                 : bool
    6369 GIC     9525743 : heap_prepare_freeze_tuple(HeapTupleHeader tuple,
    6370                 :                           const struct VacuumCutoffs *cutoffs,
    6371                 :                           HeapPageFreeze *pagefrz,
    6372                 :                           HeapTupleFreeze *frz, bool *totally_frozen)
    6373                 : {
    6374 GNC     9525743 :     bool        xmin_already_frozen = false,
    6375         9525743 :                 xmax_already_frozen = false;
    6376         9525743 :     bool        freeze_xmin = false,
    6377         9525743 :                 replace_xvac = false,
    6378         9525743 :                 replace_xmax = false,
    6379         9525743 :                 freeze_xmax = false;
    6380                 :     TransactionId xid;
    6381                 : 
    6382         9525743 :     frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
    6383 GIC     9525743 :     frz->t_infomask2 = tuple->t_infomask2;
    6384         9525743 :     frz->t_infomask = tuple->t_infomask;
    6385 GNC     9525743 :     frz->frzflags = 0;
    6386         9525743 :     frz->checkflags = 0;
    6387 EUB             : 
    6388                 :     /*
    6389                 :      * Process xmin, while keeping track of whether it's already frozen, or
    6390                 :      * will become frozen iff our freeze plan is executed by caller (could be
    6391                 :      * neither).
    6392                 :      */
    6393 GIC     9525743 :     xid = HeapTupleHeaderGetXmin(tuple);
    6394         9525743 :     if (!TransactionIdIsNormal(xid))
    6395 GNC     4010086 :         xmin_already_frozen = true;
    6396 ECB             :     else
    6397                 :     {
    6398 GNC     5515657 :         if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
    6399 UIC           0 :             ereport(ERROR,
    6400                 :                     (errcode(ERRCODE_DATA_CORRUPTED),
    6401                 :                      errmsg_internal("found xmin %u from before relfrozenxid %u",
    6402                 :                                      xid, cutoffs->relfrozenxid)));
    6403                 : 
    6404                 :         /* Will set freeze_xmin flags in freeze plan below */
    6405 GNC     5515657 :         freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
    6406                 : 
    6407                 :         /* Verify that xmin committed if and when freeze plan is executed */
    6408         5515657 :         if (freeze_xmin)
    6409         4722584 :             frz->checkflags |= HEAP_FREEZE_CHECK_XMIN_COMMITTED;
    6410 ECB             :     }
    6411                 : 
    6412                 :     /*
    6413                 :      * Old-style VACUUM FULL is gone, but we have to process xvac for as long
    6414                 :      * as we support having MOVED_OFF/MOVED_IN tuples in the database
    6415 EUB             :      */
    6416 GNC     9525743 :     xid = HeapTupleHeaderGetXvac(tuple);
    6417         9525743 :     if (TransactionIdIsNormal(xid))
    6418                 :     {
    6419 UNC           0 :         Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
    6420               0 :         Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
    6421                 : 
    6422                 :         /*
    6423                 :          * For Xvac, we always freeze proactively.  This allows totally_frozen
    6424                 :          * tracking to ignore xvac.
    6425                 :          */
    6426               0 :         replace_xvac = pagefrz->freeze_required = true;
    6427                 : 
    6428                 :         /* Will set replace_xvac flags in freeze plan below */
    6429                 :     }
    6430                 : 
    6431                 :     /* Now process xmax */
    6432 GNC     9525743 :     xid = frz->xmax;
    6433 GIC     9525743 :     if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
    6434                 :     {
    6435                 :         /* Raw xmax is a MultiXactId */
    6436                 :         TransactionId newxmax;
    6437 ECB             :         uint16      flags;
    6438                 : 
    6439                 :         /*
    6440                 :          * We will either remove xmax completely (in the "freeze_xmax" path),
    6441                 :          * process xmax by replacing it (in the "replace_xmax" path), or
    6442                 :          * perform no-op xmax processing.  The only constraint is that the
    6443                 :          * FreezeLimit/MultiXactCutoff postcondition must never be violated.
    6444                 :          */
    6445 GNC           6 :         newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
    6446                 :                                     &flags, pagefrz);
    6447 ECB             : 
    6448 GNC           6 :         if (flags & FRM_NOOP)
    6449                 :         {
    6450                 :             /*
    6451                 :              * xmax is a MultiXactId, and nothing about it changes for now.
    6452                 :              * This is the only case where 'freeze_required' won't have been
    6453                 :              * set for us by FreezeMultiXactId, as well as the only case where
    6454                 :              * neither freeze_xmax nor replace_xmax are set (given a multi).
    6455                 :              *
    6456                 :              * This is a no-op, but the call to FreezeMultiXactId might have
    6457                 :              * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
    6458                 :              * for us (the "freeze page" variants, specifically).  That'll
    6459                 :              * make it safe for our caller to freeze the page later on, while
    6460                 :              * leaving this particular xmax undisturbed.
    6461                 :              *
    6462                 :              * FreezeMultiXactId is _not_ responsible for the "no freeze"
    6463                 :              * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
    6464                 :              * job.  A call to heap_tuple_should_freeze for this same tuple
    6465                 :              * will take place below if 'freeze_required' isn't set already.
    6466                 :              * (This repeats work from FreezeMultiXactId, but allows "no
    6467                 :              * freeze" tracker maintenance to happen in only one place.)
    6468                 :              */
    6469               1 :             Assert(!MultiXactIdPrecedes(newxmax, cutoffs->MultiXactCutoff));
    6470               1 :             Assert(MultiXactIdIsValid(newxmax) && xid == newxmax);
    6471                 :         }
    6472               5 :         else if (flags & FRM_RETURN_IS_XID)
    6473                 :         {
    6474                 :             /*
    6475 ECB             :              * xmax will become an updater Xid (original MultiXact's updater
    6476                 :              * member Xid will be carried forward as a simple Xid in Xmax).
    6477 EUB             :              */
    6478 UNC           0 :             Assert(!TransactionIdPrecedes(newxmax, cutoffs->OldestXmin));
    6479 ECB             : 
    6480                 :             /*
    6481                 :              * NB -- some of these transformations are only valid because we
    6482                 :              * know the return Xid is a tuple updater (i.e. not merely a
    6483                 :              * locker.) Also note that the only reason we don't explicitly
    6484                 :              * worry about HEAP_KEYS_UPDATED is that it lives in
    6485                 :              * t_infomask2 rather than t_infomask.
    6486                 :              */
    6487 UIC           0 :             frz->t_infomask &= ~HEAP_XMAX_BITS;
    6488               0 :             frz->xmax = newxmax;
    6489               0 :             if (flags & FRM_MARK_COMMITTED)
    6490               0 :                 frz->t_infomask |= HEAP_XMAX_COMMITTED;
    6491 UNC           0 :             replace_xmax = true;
    6492 EUB             :         }
    6493 GBC           5 :         else if (flags & FRM_RETURN_IS_MULTI)
    6494 EUB             :         {
    6495                 :             uint16      newbits;
    6496                 :             uint16      newbits2;
    6497                 : 
    6498 ECB             :             /*
    6499                 :              * xmax is an old MultiXactId that we have to replace with a new
    6500                 :              * MultiXactId, to carry forward two or more original member XIDs.
    6501                 :              */
    6502 GNC           1 :             Assert(!MultiXactIdPrecedes(newxmax, cutoffs->OldestMxact));
    6503                 : 
    6504                 :             /*
    6505                 :              * We can't use GetMultiXactIdHintBits directly on the new multi
    6506                 :              * here; that routine initializes the masks to all zeroes, which
    6507                 :              * would lose other bits we need.  Doing it this way ensures all
    6508                 :              * unrelated bits remain untouched.
    6509 ECB             :              */
    6510 CBC           1 :             frz->t_infomask &= ~HEAP_XMAX_BITS;
    6511               1 :             frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    6512               1 :             GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
    6513 GIC           1 :             frz->t_infomask |= newbits;
    6514               1 :             frz->t_infomask2 |= newbits2;
    6515               1 :             frz->xmax = newxmax;
    6516 GNC           1 :             replace_xmax = true;
    6517                 :         }
    6518                 :         else
    6519 ECB             :         {
    6520                 :             /*
    6521                 :              * Freeze plan for tuple "freezes xmax" in the strictest sense:
    6522                 :              * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
    6523                 :              */
    6524 GNC           4 :             Assert(flags & FRM_INVALIDATE_XMAX);
    6525 GIC           4 :             Assert(!TransactionIdIsValid(newxmax));
    6526                 : 
    6527                 :             /* Will set freeze_xmax flags in freeze plan below */
    6528 GNC           4 :             freeze_xmax = true;
    6529                 :         }
    6530                 : 
    6531                 :         /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
    6532               6 :         Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
    6533                 :     }
    6534 GIC     9525737 :     else if (TransactionIdIsNormal(xid))
    6535                 :     {
    6536                 :         /* Raw xmax is normal XID */
    6537 GNC      353374 :         if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
    6538 UIC           0 :             ereport(ERROR,
    6539 ECB             :                     (errcode(ERRCODE_DATA_CORRUPTED),
    6540                 :                      errmsg_internal("found xmax %u from before relfrozenxid %u",
    6541                 :                                      xid, cutoffs->relfrozenxid)));
    6542                 : 
    6543                 :         /* Will set freeze_xmax flags in freeze plan below */
    6544 GNC      353374 :         freeze_xmax = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
    6545                 : 
    6546                 :         /*
    6547                 :          * Verify that xmax aborted if and when freeze plan is executed,
    6548                 :          * provided it's from an update. (A lock-only xmax can be removed
    6549                 :          * independently of this, since the lock is released at xact end.)
    6550                 :          */
    6551          353374 :         if (freeze_xmax && !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
    6552             219 :             frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
    6553                 :     }
    6554         9172363 :     else if (!TransactionIdIsValid(xid))
    6555                 :     {
    6556                 :         /* Raw xmax is InvalidTransactionId XID */
    6557         9172363 :         Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
    6558 GIC     9172363 :         xmax_already_frozen = true;
    6559                 :     }
    6560 ECB             :     else
    6561 UIC           0 :         ereport(ERROR,
    6562                 :                 (errcode(ERRCODE_DATA_CORRUPTED),
    6563                 :                  errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
    6564                 :                                  xid, tuple->t_infomask)));
    6565                 : 
    6566 GNC     9525743 :     if (freeze_xmin)
    6567                 :     {
    6568         4722584 :         Assert(!xmin_already_frozen);
    6569                 : 
    6570         4722584 :         frz->t_infomask |= HEAP_XMIN_FROZEN;
    6571                 :     }
    6572         9525743 :     if (replace_xvac)
    6573                 :     {
    6574                 :         /*
    6575                 :          * If a MOVED_OFF tuple is not dead, the xvac transaction must have
    6576                 :          * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
    6577                 :          * transaction succeeded.
    6578                 :          */
    6579 UNC           0 :         Assert(pagefrz->freeze_required);
    6580               0 :         if (tuple->t_infomask & HEAP_MOVED_OFF)
    6581               0 :             frz->frzflags |= XLH_INVALID_XVAC;
    6582                 :         else
    6583               0 :             frz->frzflags |= XLH_FREEZE_XVAC;
    6584                 :     }
    6585 GNC     9525743 :     if (replace_xmax)
    6586                 :     {
    6587               1 :         Assert(!xmax_already_frozen && !freeze_xmax);
    6588               1 :         Assert(pagefrz->freeze_required);
    6589                 : 
    6590                 :         /* Already set replace_xmax flags in freeze plan earlier */
    6591                 :     }
    6592 GIC     9525743 :     if (freeze_xmax)
    6593                 :     {
    6594 GNC        1058 :         Assert(!xmax_already_frozen && !replace_xmax);
    6595 ECB             : 
    6596 GIC        1058 :         frz->xmax = InvalidTransactionId;
    6597 ECB             : 
    6598                 :         /*
    6599                 :          * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
    6600                 :          * LOCKED.  Normalize to INVALID just to be sure no one gets confused.
    6601                 :          * Also get rid of the HEAP_KEYS_UPDATED bit.
    6602                 :          */
    6603 GIC        1058 :         frz->t_infomask &= ~HEAP_XMAX_BITS;
    6604 CBC        1058 :         frz->t_infomask |= HEAP_XMAX_INVALID;
    6605 GIC        1058 :         frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
    6606 CBC        1058 :         frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    6607 ECB             :     }
    6608                 : 
    6609 EUB             :     /*
    6610                 :      * Determine if this tuple is already totally frozen, or will become
    6611                 :      * totally frozen (provided caller executes freeze plans for the page)
    6612                 :      */
    6613 GNC    18257355 :     *totally_frozen = ((freeze_xmin || xmin_already_frozen) &&
    6614         8731612 :                        (freeze_xmax || xmax_already_frozen));
    6615                 : 
    6616         9525743 :     if (!pagefrz->freeze_required && !(xmin_already_frozen &&
    6617                 :                                        xmax_already_frozen))
    6618                 :     {
    6619                 :         /*
    6620                 :          * So far no previous tuple from the page made freezing mandatory.
    6621                 :          * Does this tuple force caller to freeze the entire page?
    6622 EUB             :          */
    6623 GNC     2207959 :         pagefrz->freeze_required =
    6624         2207959 :             heap_tuple_should_freeze(tuple, cutoffs,
    6625                 :                                      &pagefrz->NoFreezePageRelfrozenXid,
    6626                 :                                      &pagefrz->NoFreezePageRelminMxid);
    6627                 :     }
    6628 ECB             : 
    6629                 :     /* Tell caller if this tuple has a usable freeze plan set in *frz */
    6630 GNC     9525743 :     return freeze_xmin || replace_xvac || replace_xmax || freeze_xmax;
    6631                 : }
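/*
 * Illustrative sketch (not part of heapam.c): one way a VACUUM-style
 * caller might drive the prepare/execute API described above.  This is a
 * hedged example only; the function name, the freeze-only-when-required
 * policy, and the tracker initialization (using OldestXmin/OldestMxact,
 * per the comments above) are assumptions, not vacuumlazy.c's actual
 * logic.
 */
#ifdef NOT_USED
static void
example_freeze_page_sketch(Relation rel, Buffer buffer,
                           const struct VacuumCutoffs *cutoffs,
                           TransactionId snapshotConflictHorizon)
{
    Page        page = BufferGetPage(buffer);
    OffsetNumber offnum,
                maxoff = PageGetMaxOffsetNumber(page);
    HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
    HeapPageFreeze pagefrz;
    int         nfrozen = 0;
    bool        totally_frozen;

    pagefrz.freeze_required = false;
    pagefrz.FreezePageRelfrozenXid = cutoffs->OldestXmin;
    pagefrz.FreezePageRelminMxid = cutoffs->OldestMxact;
    pagefrz.NoFreezePageRelfrozenXid = cutoffs->OldestXmin;
    pagefrz.NoFreezePageRelminMxid = cutoffs->OldestMxact;

    for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum++)
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        HeapTupleHeader htup;

        if (!ItemIdIsNormal(itemid))
            continue;
        htup = (HeapTupleHeader) PageGetItem(page, itemid);
        if (heap_prepare_freeze_tuple(htup, cutoffs, &pagefrz,
                                      &frozen[nfrozen], &totally_frozen))
            frozen[nfrozen++].offset = offnum;  /* caller sets offset */
    }

    /* Freeze only when forced to; a real caller may also opt in */
    if (nfrozen > 0 && pagefrz.freeze_required)
        heap_freeze_execute_prepared(rel, buffer, snapshotConflictHorizon,
                                     frozen, nfrozen);
}
#endif                          /* NOT_USED */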
    6632                 : 
    6633                 : /*
    6634                 :  * heap_execute_freeze_tuple
    6635                 :  *      Execute the prepared freezing of a tuple with caller's freeze plan.
    6636 ECB             :  *
    6637                 :  * Caller is responsible for ensuring that no other backend can access the
    6638                 :  * storage underlying this tuple, either by holding an exclusive lock on the
    6639                 :  * buffer containing it (which is what lazy VACUUM does), or by having it be
    6640                 :  * in private storage (which is what CLUSTER and friends do).
    6641                 :  */
    6642                 : static inline void
    6643 GNC     3307177 : heap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz)
    6644                 : {
    6645 CBC     3307177 :     HeapTupleHeaderSetXmax(tuple, frz->xmax);
    6646                 : 
    6647         3307177 :     if (frz->frzflags & XLH_FREEZE_XVAC)
    6648 UIC           0 :         HeapTupleHeaderSetXvac(tuple, FrozenTransactionId);
    6649                 : 
    6650 CBC     3307177 :     if (frz->frzflags & XLH_INVALID_XVAC)
    6651 LBC           0 :         HeapTupleHeaderSetXvac(tuple, InvalidTransactionId);
    6652                 : 
    6653 GIC     3307177 :     tuple->t_infomask = frz->t_infomask;
    6654         3307177 :     tuple->t_infomask2 = frz->t_infomask2;
    6655         3307177 : }
    6656                 : 
    6657                 : /*
    6658                 :  * heap_freeze_execute_prepared
    6659                 :  *
    6660                 :  * Executes freezing of one or more heap tuples on a page on behalf of caller.
    6661                 :  * Caller passes an array of tuple plans from heap_prepare_freeze_tuple.
    6662                 :  * Caller must set 'offset' in each plan for us.  Note that we destructively
    6663                 :  * sort caller's tuples array in-place, so caller had better be done with it.
    6664                 :  *
    6665                 :  * WAL-logs the changes so that VACUUM can advance the rel's relfrozenxid
    6666                 :  * later on without any risk of unsafe pg_xact lookups, even following a hard
    6667                 :  * crash (or when querying from a standby).  We represent freezing by setting
    6668                 :  * infomask bits in tuple headers, but this shouldn't be thought of as a hint.
    6669                 :  * See section on buffer access rules in src/backend/storage/buffer/README.
    6670                 :  */
    6671                 : void
    6672 GNC       80091 : heap_freeze_execute_prepared(Relation rel, Buffer buffer,
    6673                 :                              TransactionId snapshotConflictHorizon,
    6674                 :                              HeapTupleFreeze *tuples, int ntuples)
    6675                 : {
    6676           80091 :     Page        page = BufferGetPage(buffer);
    6677                 : 
    6678           80091 :     Assert(ntuples > 0);
    6679                 : 
    6680                 :     /*
    6681                 :      * Perform xmin/xmax XID status sanity checks before critical section.
    6682                 :      *
    6683                 :      * heap_prepare_freeze_tuple doesn't perform these checks directly because
    6684                 :      * pg_xact lookups are relatively expensive.  They shouldn't be repeated
    6685                 :      * by successive VACUUMs that each decide against freezing the same page.
    6686                 :      */
    6687         3137114 :     for (int i = 0; i < ntuples; i++)
    6688                 :     {
    6689         3057023 :         HeapTupleFreeze *frz = tuples + i;
    6690         3057023 :         ItemId      itemid = PageGetItemId(page, frz->offset);
    6691                 :         HeapTupleHeader htup;
    6692                 : 
    6693         3057023 :         htup = (HeapTupleHeader) PageGetItem(page, itemid);
    6694                 : 
    6695                 :         /* Deliberately avoid relying on tuple hint bits here */
    6696         3057023 :         if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
    6697                 :         {
    6698         3057022 :             TransactionId xmin = HeapTupleHeaderGetRawXmin(htup);
    6699                 : 
    6700         3057022 :             Assert(!HeapTupleHeaderXminFrozen(htup));
    6701         3057022 :             if (unlikely(!TransactionIdDidCommit(xmin)))
    6702 UNC           0 :                 ereport(ERROR,
    6703                 :                         (errcode(ERRCODE_DATA_CORRUPTED),
    6704                 :                          errmsg_internal("uncommitted xmin %u needs to be frozen",
    6705                 :                                          xmin)));
    6706                 :         }
    6707                 : 
    6708                 :         /*
    6709                 :          * TransactionIdDidAbort won't work reliably in the presence of XIDs
    6710                 :          * left behind by transactions that were in progress during a crash,
    6711                 :          * so we can only check that xmax didn't commit
    6712                 :          */
    6713 GNC     3057023 :         if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
    6714                 :         {
    6715              16 :             TransactionId xmax = HeapTupleHeaderGetRawXmax(htup);
    6716                 : 
    6717              16 :             Assert(TransactionIdIsNormal(xmax));
    6718              16 :             if (unlikely(TransactionIdDidCommit(xmax)))
    6719 UNC           0 :                 ereport(ERROR,
    6720                 :                         (errcode(ERRCODE_DATA_CORRUPTED),
    6721                 :                          errmsg_internal("cannot freeze committed xmax %u",
    6722                 :                                          xmax)));
    6723                 :         }
    6724                 :     }
    6725                 : 
    6726 GNC       80091 :     START_CRIT_SECTION();
    6727                 : 
    6728         3137114 :     for (int i = 0; i < ntuples; i++)
    6729                 :     {
    6730         3057023 :         HeapTupleFreeze *frz = tuples + i;
    6731         3057023 :         ItemId      itemid = PageGetItemId(page, frz->offset);
    6732                 :         HeapTupleHeader htup;
    6733                 : 
    6734         3057023 :         htup = (HeapTupleHeader) PageGetItem(page, itemid);
    6735         3057023 :         heap_execute_freeze_tuple(htup, frz);
    6736                 :     }
    6737                 : 
    6738           80091 :     MarkBufferDirty(buffer);
    6739                 : 
    6740                 :     /* Now WAL-log freezing if necessary */
    6741           80091 :     if (RelationNeedsWAL(rel))
    6742                 :     {
    6743                 :         xl_heap_freeze_plan plans[MaxHeapTuplesPerPage];
    6744                 :         OffsetNumber offsets[MaxHeapTuplesPerPage];
    6745                 :         int         nplans;
    6746                 :         xl_heap_freeze_page xlrec;
    6747                 :         XLogRecPtr  recptr;
    6748                 : 
    6749                 :         /* Prepare deduplicated representation for use in WAL record */
    6750           80089 :         nplans = heap_log_freeze_plan(tuples, ntuples, plans, offsets);
    6751                 : 
    6752           80089 :         xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
    6753           80089 :         xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(rel);
    6754           80089 :         xlrec.nplans = nplans;
    6755                 : 
    6756           80089 :         XLogBeginInsert();
    6757           80089 :         XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
    6758                 : 
    6759                 :         /*
    6760                 :          * The freeze plan array and offset array are not actually in the
    6761                 :          * buffer, but pretend that they are.  When XLogInsert stores the
    6762                 :          * whole buffer, the arrays need not be stored too.
    6763                 :          */
    6764           80089 :         XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
    6765           80089 :         XLogRegisterBufData(0, (char *) plans,
    6766                 :                             nplans * sizeof(xl_heap_freeze_plan));
    6767           80089 :         XLogRegisterBufData(0, (char *) offsets,
    6768                 :                             ntuples * sizeof(OffsetNumber));
    6769                 : 
    6770           80089 :         recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
    6771                 : 
    6772           80089 :         PageSetLSN(page, recptr);
    6773                 :     }
    6774                 : 
    6775           80091 :     END_CRIT_SECTION();
    6776           80091 : }
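/*
 * Editorial note (illustrative, not part of heapam.c): the resulting
 * XLOG_HEAP2_FREEZE_PAGE record is laid out as
 *
 *   main data:    xl_heap_freeze_page (snapshotConflictHorizon,
 *                 isCatalogRel, nplans)
 *   block 0 data: nplans * xl_heap_freeze_plan, followed by
 *                 ntuples * OffsetNumber, grouped by plan
 *
 * Because the arrays are registered as buffer data, XLogInsert omits them
 * whenever it stores a full-page image of the buffer instead.
 */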
    6777                 : 
    6778                 : /*
    6779                 :  * Comparator used to deduplicate XLOG_HEAP2_FREEZE_PAGE freeze plans
    6780                 :  */
    6781                 : static int
    6782         3873335 : heap_log_freeze_cmp(const void *arg1, const void *arg2)
    6783                 : {
    6784         3873335 :     HeapTupleFreeze *frz1 = (HeapTupleFreeze *) arg1;
    6785         3873335 :     HeapTupleFreeze *frz2 = (HeapTupleFreeze *) arg2;
    6786                 : 
    6787         3873335 :     if (frz1->xmax < frz2->xmax)
    6788               3 :         return -1;
    6789         3873332 :     else if (frz1->xmax > frz2->xmax)
    6790              13 :         return 1;
    6791                 : 
    6792         3873319 :     if (frz1->t_infomask2 < frz2->t_infomask2)
    6793            8108 :         return -1;
    6794         3865211 :     else if (frz1->t_infomask2 > frz2->t_infomask2)
    6795           22015 :         return 1;
    6796                 : 
    6797         3843196 :     if (frz1->t_infomask < frz2->t_infomask)
    6798           51398 :         return -1;
    6799         3791798 :     else if (frz1->t_infomask > frz2->t_infomask)
    6800           86119 :         return 1;
    6801                 : 
    6802         3705679 :     if (frz1->frzflags < frz2->frzflags)
    6803 UNC           0 :         return -1;
    6804 GNC     3705679 :     else if (frz1->frzflags > frz2->frzflags)
    6805 UNC           0 :         return 1;
    6806                 : 
    6807                 :     /*
    6808                 :      * heap_log_freeze_eq would consider these tuple-wise plans to be equal.
    6809                 :      * (So the tuples will share a single canonical freeze plan.)
    6810                 :      *
    6811                 :      * We tiebreak on page offset number to keep each freeze plan's page
    6812                 :      * offset number array individually sorted. (Unnecessary, but be tidy.)
    6813                 :      */
    6814 GNC     3705679 :     if (frz1->offset < frz2->offset)
    6815         3313703 :         return -1;
    6816          391976 :     else if (frz1->offset > frz2->offset)
    6817          391976 :         return 1;
    6818                 : 
    6819 UNC           0 :     Assert(false);
    6820                 :     return 0;
    6821                 : }
    6822                 : 
    6823                 : /*
    6824                 :  * Compare fields that describe actions required to freeze tuple with caller's
    6825                 :  * open plan.  If everything matches then the frz tuple plan is equivalent to
    6826                 :  * caller's plan.
    6827                 :  */
    6828                 : static inline bool
    6829 GNC     2976932 : heap_log_freeze_eq(xl_heap_freeze_plan *plan, HeapTupleFreeze *frz)
    6830                 : {
    6831         2976932 :     if (plan->xmax == frz->xmax &&
    6832         2976927 :         plan->t_infomask2 == frz->t_infomask2 &&
    6833         2973857 :         plan->t_infomask == frz->t_infomask &&
    6834         2961624 :         plan->frzflags == frz->frzflags)
    6835         2961624 :         return true;
    6836                 : 
    6837                 :     /* Caller must call heap_log_freeze_new_plan again for frz */
    6838           15308 :     return false;
    6839                 : }
    6840                 : 
    6841                 : /*
    6842                 :  * Start new plan initialized using tuple-level actions.  At least one tuple
    6843                 :  * will have steps required to freeze described by caller's plan during REDO.
    6844                 :  */
    6845                 : static inline void
    6846           95397 : heap_log_freeze_new_plan(xl_heap_freeze_plan *plan, HeapTupleFreeze *frz)
    6847                 : {
    6848           95397 :     plan->xmax = frz->xmax;
    6849           95397 :     plan->t_infomask2 = frz->t_infomask2;
    6850           95397 :     plan->t_infomask = frz->t_infomask;
    6851           95397 :     plan->frzflags = frz->frzflags;
    6852           95397 :     plan->ntuples = 1;           /* for now */
    6853           95397 : }
    6854                 : 
    6855                 : /*
    6856                 :  * Deduplicate tuple-based freeze plans so that each distinct set of
    6857                 :  * processing steps is only stored once in XLOG_HEAP2_FREEZE_PAGE records.
    6858                 :  * Called during original execution of freezing (for logged relations).
    6859                 :  *
    6860                 :  * Return value is the number of plans set in *plans_out for caller.  Also writes
    6861                 :  * an array of offset numbers into *offsets_out output argument for caller
    6862                 :  * (actually there is one array per freeze plan, but that's not of immediate
    6863                 :  * concern to our caller).
    6864                 :  */
    6865                 : static int
    6866           80089 : heap_log_freeze_plan(HeapTupleFreeze *tuples, int ntuples,
    6867                 :                      xl_heap_freeze_plan *plans_out,
    6868                 :                      OffsetNumber *offsets_out)
    6869                 : {
    6870           80089 :     int         nplans = 0;
    6871                 : 
    6872                 :     /* Sort tuple-based freeze plans in the order required to deduplicate */
    6873           80089 :     qsort(tuples, ntuples, sizeof(HeapTupleFreeze), heap_log_freeze_cmp);
    6874                 : 
    6875         3137110 :     for (int i = 0; i < ntuples; i++)
    6876                 :     {
    6877         3057021 :         HeapTupleFreeze *frz = tuples + i;
    6878                 : 
    6879         3057021 :         if (i == 0)
    6880                 :         {
    6881                 :             /* New canonical freeze plan starting with first tup */
    6882           80089 :             heap_log_freeze_new_plan(plans_out, frz);
    6883           80089 :             nplans++;
    6884                 :         }
    6885         2976932 :         else if (heap_log_freeze_eq(plans_out, frz))
    6886                 :         {
    6887                 :             /* tup matches open canonical plan -- include tup in it */
    6888         2961624 :             Assert(offsets_out[i - 1] < frz->offset);
    6889         2961624 :             plans_out->ntuples++;
    6890                 :         }
    6891                 :         else
    6892                 :         {
    6893                 :             /* Tup doesn't match current plan -- done with it now */
    6894           15308 :             plans_out++;
    6895                 : 
    6896                 :             /* New canonical freeze plan starting with this tup */
    6897           15308 :             heap_log_freeze_new_plan(plans_out, frz);
    6898           15308 :             nplans++;
    6899                 :         }
    6900                 : 
    6901                 :         /*
    6902                 :          * Save page offset number in dedicated buffer in passing.
    6903                 :          *
    6904                 :          * REDO routine relies on the record's offset numbers array grouping
    6905                 :          * offset numbers by freeze plan.  The sort order within each grouping
    6906                 :          * is ascending offset number order, just to keep things tidy.
    6907                 :          */
    6908         3057021 :         offsets_out[i] = frz->offset;
    6909                 :     }
    6910                 : 
    6911           80089 :     Assert(nplans > 0 && nplans <= ntuples);
    6912                 : 
    6913           80089 :     return nplans;
    6914                 : }
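/*
 * Worked example (invented values, not from heapam.c): suppose ntuples = 5
 * and, after the heap_log_freeze_cmp sort, the tuples at offsets 2, 5, and
 * 9 share one set of freeze actions while those at offsets 3 and 7 share
 * another.  heap_log_freeze_plan then returns nplans = 2 with
 *
 *   plans_out[0].ntuples = 3     offsets_out[] = {2, 5, 9, 3, 7}
 *   plans_out[1].ntuples = 2
 *
 * The REDO routine consumes ntuples offset numbers per plan, in order.
 */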
    6915                 : 
    6916 ECB             : /*
    6917                 :  * heap_freeze_tuple
    6918                 :  *      Freeze tuple in place, without WAL logging.
    6919                 :  *
    6920                 :  * Useful for callers like CLUSTER that perform their own WAL logging.
    6921                 :  */
    6922                 : bool
    6923 CBC      384627 : heap_freeze_tuple(HeapTupleHeader tuple,
    6924 ECB             :                   TransactionId relfrozenxid, TransactionId relminmxid,
    6925                 :                   TransactionId FreezeLimit, TransactionId MultiXactCutoff)
    6926                 : {
    6927                 :     HeapTupleFreeze frz;
    6928                 :     bool        do_freeze;
    6929                 :     bool        totally_frozen;
    6930                 :     struct VacuumCutoffs cutoffs;
    6931                 :     HeapPageFreeze pagefrz;
    6932                 : 
    6933 GNC      384627 :     cutoffs.relfrozenxid = relfrozenxid;
    6934          384627 :     cutoffs.relminmxid = relminmxid;
    6935          384627 :     cutoffs.OldestXmin = FreezeLimit;
    6936          384627 :     cutoffs.OldestMxact = MultiXactCutoff;
    6937          384627 :     cutoffs.FreezeLimit = FreezeLimit;
    6938          384627 :     cutoffs.MultiXactCutoff = MultiXactCutoff;
    6939                 : 
    6940          384627 :     pagefrz.freeze_required = true;
    6941          384627 :     pagefrz.FreezePageRelfrozenXid = FreezeLimit;
    6942          384627 :     pagefrz.FreezePageRelminMxid = MultiXactCutoff;
    6943          384627 :     pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
    6944          384627 :     pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
    6945                 : 
    6946          384627 :     do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
    6947                 :                                           &pagefrz, &frz, &totally_frozen);
    6948 ECB             : 
    6949 EUB             :     /*
    6950                 :      * Note that because this is not a WAL-logged operation, we don't need to
    6951                 :      * fill in the offset in the freeze record.
    6952                 :      */
    6953                 : 
    6954 GIC      384627 :     if (do_freeze)
    6955          247765 :         heap_execute_freeze_tuple(tuple, &frz);
    6956          384627 :     return do_freeze;
    6957                 : }
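/*
 * Illustrative sketch (not part of heapam.c): a CLUSTER-style caller that
 * holds the tuple in private storage and WAL-logs pages itself.  The names
 * OldHeap, newtup, FreezeXid, and MultiXactCutoff are assumptions for
 * illustration, loosely modeled on the heap-rewrite path.
 */
#ifdef NOT_USED
static void
example_rewrite_freeze_sketch(Relation OldHeap, HeapTuple newtup,
                              TransactionId FreezeXid,
                              MultiXactId MultiXactCutoff)
{
    if (heap_freeze_tuple(newtup->t_data,
                          OldHeap->rd_rel->relfrozenxid,
                          OldHeap->rd_rel->relminmxid,
                          FreezeXid, MultiXactCutoff))
    {
        /* header was rewritten in place; caller WAL-logs the page later */
    }
}
#endif                          /* NOT_USED */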
    6958 ECB             : 
    6959                 : /*
    6960                 :  * For a given MultiXactId, return the hint bits that should be set in the
    6961                 :  * tuple's infomask.
    6962                 :  *
    6963 EUB             :  * Normally this should be called for a multixact that was just created, and
    6964                 :  * so is on our local cache, so the GetMembers call is fast.
    6965                 :  */
    6966                 : static void
    6967 GIC        1170 : GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
    6968                 :                        uint16 *new_infomask2)
    6969                 : {
    6970                 :     int         nmembers;
    6971                 :     MultiXactMember *members;
    6972                 :     int         i;
    6973 CBC        1170 :     uint16      bits = HEAP_XMAX_IS_MULTI;
    6974 GIC        1170 :     uint16      bits2 = 0;
    6975 CBC        1170 :     bool        has_update = false;
    6976            1170 :     LockTupleMode strongest = LockTupleKeyShare;
    6977 ECB             : 
    6978                 :     /*
    6979                 :      * We only use this in multis we just created, so they cannot be values
    6980                 :      * pre-pg_upgrade.
    6981                 :      */
    6982 CBC        1170 :     nmembers = GetMultiXactIdMembers(multi, &members, false, false);
    6983                 : 
    6984 GIC        3579 :     for (i = 0; i < nmembers; i++)
    6985                 :     {
    6986                 :         LockTupleMode mode;
    6987                 : 
    6988                 :         /*
    6989                 :          * Remember the strongest lock mode held by any member of the
    6990 ECB             :          * multixact.
    6991                 :          */
    6992 CBC        2409 :         mode = TUPLOCK_from_mxstatus(members[i].status);
    6993            2409 :         if (mode > strongest)
    6994             654 :             strongest = mode;
    6995 ECB             : 
    6996                 :         /* See what other bits we need */
    6997 CBC        2409 :         switch (members[i].status)
    6998                 :         {
    6999 GIC        2218 :             case MultiXactStatusForKeyShare:
    7000                 :             case MultiXactStatusForShare:
    7001                 :             case MultiXactStatusForNoKeyUpdate:
    7002            2218 :                 break;
    7003                 : 
    7004              52 :             case MultiXactStatusForUpdate:
    7005              52 :                 bits2 |= HEAP_KEYS_UPDATED;
    7006              52 :                 break;
    7007                 : 
    7008             129 :             case MultiXactStatusNoKeyUpdate:
    7009             129 :                 has_update = true;
    7010 CBC         129 :                 break;
    7011                 : 
    7012 GIC          10 :             case MultiXactStatusUpdate:
    7013              10 :                 bits2 |= HEAP_KEYS_UPDATED;
    7014 CBC          10 :                 has_update = true;
    7015 GIC          10 :                 break;
    7016                 :         }
    7017 ECB             :     }
    7018                 : 
    7019 CBC        1170 :     if (strongest == LockTupleExclusive ||
    7020                 :         strongest == LockTupleNoKeyExclusive)
    7021             216 :         bits |= HEAP_XMAX_EXCL_LOCK;
    7022 GIC         954 :     else if (strongest == LockTupleShare)
    7023 CBC         435 :         bits |= HEAP_XMAX_SHR_LOCK;
    7024 GIC         519 :     else if (strongest == LockTupleKeyShare)
    7025             519 :         bits |= HEAP_XMAX_KEYSHR_LOCK;
    7026 ECB             : 
    7027 CBC        1170 :     if (!has_update)
    7028 GIC        1031 :         bits |= HEAP_XMAX_LOCK_ONLY;
    7029 ECB             : 
    7030 GIC        1170 :     if (nmembers > 0)
    7031            1170 :         pfree(members);
    7032 ECB             : 
    7033 CBC        1170 :     *new_infomask = bits;
    7034 GIC        1170 :     *new_infomask2 = bits2;
    7035            1170 : }
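/*
 * Worked example (invented multi, not from heapam.c): for members
 * {ForKeyShare, NoKeyUpdate}, the strongest lock mode seen above is
 * LockTupleNoKeyExclusive, and has_update is set, so the result is
 *
 *   *new_infomask  = HEAP_XMAX_IS_MULTI | HEAP_XMAX_EXCL_LOCK
 *   *new_infomask2 = 0
 *
 * HEAP_XMAX_LOCK_ONLY is not set because the multi contains an update.
 */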
    7036                 : 
    7037                 : /*
    7038 ECB             :  * MultiXactIdGetUpdateXid
    7039                 :  *
    7040                 :  * Given a multixact Xmax and corresponding infomask, which does not have the
    7041                 :  * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
    7042                 :  * transaction.
    7043                 :  *
    7044                 :  * Caller is expected to check the status of the updating transaction, if
    7045                 :  * necessary.
    7046                 :  */
    7047                 : static TransactionId
    7048 GIC         526 : MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
    7049                 : {
    7050             526 :     TransactionId update_xact = InvalidTransactionId;
    7051                 :     MultiXactMember *members;
    7052 ECB             :     int         nmembers;
    7053                 : 
    7054 GIC         526 :     Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
    7055 CBC         526 :     Assert(t_infomask & HEAP_XMAX_IS_MULTI);
    7056                 : 
    7057 ECB             :     /*
    7058                 :      * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
    7059                 :      * pre-pg_upgrade.
    7060                 :      */
    7061 GIC         526 :     nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
    7062                 : 
    7063             526 :     if (nmembers > 0)
    7064                 :     {
    7065                 :         int         i;
    7066                 : 
    7067 CBC        1978 :         for (i = 0; i < nmembers; i++)
    7068                 :         {
    7069                 :             /* Ignore lockers */
    7070 GIC        1452 :             if (!ISUPDATE_from_mxstatus(members[i].status))
    7071             926 :                 continue;
    7072                 : 
    7073                 :             /* there can be at most one updater */
    7074             526 :             Assert(update_xact == InvalidTransactionId);
    7075             526 :             update_xact = members[i].xid;
    7076                 : #ifndef USE_ASSERT_CHECKING
    7077 ECB             : 
    7078                 :             /*
    7079                 :              * In a non-assert build, stop at the first updater; assert
    7080                 :              * builds walk the whole array to ensure there's no other updater.
    7081                 :              */
    7082                 :             break;
    7083                 : #endif
    7084                 :         }
    7085                 : 
    7086 CBC         526 :         pfree(members);
    7087 ECB             :     }
    7088                 : 
    7089 GIC         526 :     return update_xact;
    7090 ECB             : }
    7091                 : 
    7092                 : /*
    7093                 :  * HeapTupleGetUpdateXid
    7094                 :  *      As above, but use a HeapTupleHeader
    7095                 :  *
    7096                 :  * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
    7097                 :  * checking the hint bits.
    7098                 :  */
    7099                 : TransactionId
    7100 CBC         518 : HeapTupleGetUpdateXid(HeapTupleHeader tuple)
    7101                 : {
    7102 GIC        1036 :     return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tuple),
    7103             518 :                                    tuple->t_infomask);
    7104                 : }
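
The loop above relies on ISUPDATE_from_mxstatus to separate lockers from the (at most one) updater. Below is a small standalone sketch of that test; the DEMO_MXS_* enum and the comparison are assumptions modeled on src/include/access/multixact.h, where locker statuses deliberately sort below updater statuses.

    #include <stdio.h>

    typedef enum
    {
        DEMO_MXS_FOR_KEY_SHARE = 0x00,
        DEMO_MXS_FOR_SHARE = 0x01,
        DEMO_MXS_FOR_NO_KEY_UPDATE = 0x02,
        DEMO_MXS_FOR_UPDATE = 0x03,         /* strongest pure-lock status */
        DEMO_MXS_NO_KEY_UPDATE = 0x04,      /* actual updaters from here up */
        DEMO_MXS_UPDATE = 0x05
    } DemoMultiXactStatus;

    #define DEMO_ISUPDATE(status) ((status) > DEMO_MXS_FOR_UPDATE)

    int
    main(void)
    {
        DemoMultiXactStatus members[] = {
            DEMO_MXS_FOR_KEY_SHARE, DEMO_MXS_NO_KEY_UPDATE, DEMO_MXS_FOR_SHARE
        };
        int nupdaters = 0;

        for (int i = 0; i < 3; i++)
            if (DEMO_ISUPDATE(members[i]))
                nupdaters++;

        printf("updaters: %d\n", nupdaters);    /* 1: at most one is legal */
        return 0;
    }
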
    7105                 : 
    7106                 : /*
    7107                 :  * Does the given multixact conflict with the current transaction grabbing a
    7108                 :  * tuple lock of the given strength?
    7109                 :  *
    7110                 :  * The passed infomask pairs up with the given multixact in the tuple header.
    7111 ECB             :  *
    7112                 :  * If current_is_member is not NULL, it is set to 'true' if the current
    7113                 :  * transaction is a member of the given multixact.
    7114                 :  */
    7115                 : static bool
    7116 GIC          94 : DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
    7117 ECB             :                         LockTupleMode lockmode, bool *current_is_member)
    7118                 : {
    7119                 :     int         nmembers;
    7120                 :     MultiXactMember *members;
    7121 GIC          94 :     bool        result = false;
    7122              94 :     LOCKMODE    wanted = tupleLockExtraInfo[lockmode].hwlock;
    7123                 : 
    7124              94 :     if (HEAP_LOCKED_UPGRADED(infomask))
    7125 UIC           0 :         return false;
    7126 ECB             : 
    7127 GIC          94 :     nmembers = GetMultiXactIdMembers(multi, &members, false,
    7128 CBC          94 :                                      HEAP_XMAX_IS_LOCKED_ONLY(infomask));
    7129 GIC          94 :     if (nmembers >= 0)
    7130                 :     {
    7131                 :         int         i;
    7132                 : 
    7133             295 :         for (i = 0; i < nmembers; i++)
    7134                 :         {
    7135                 :             TransactionId memxid;
    7136 ECB             :             LOCKMODE    memlockmode;
    7137                 : 
    7138 CBC         207 :             if (result && (current_is_member == NULL || *current_is_member))
    7139                 :                 break;
    7140                 : 
    7141             201 :             memlockmode = LOCKMODE_from_mxstatus(members[i].status);
    7142                 : 
    7143 ECB             :             /* ignore members from current xact (but track their presence) */
    7144 GIC         201 :             memxid = members[i].xid;
    7145             201 :             if (TransactionIdIsCurrentTransactionId(memxid))
    7146 ECB             :             {
    7147 GIC          91 :                 if (current_is_member != NULL)
    7148 CBC          78 :                     *current_is_member = true;
    7149              91 :                 continue;
    7150 ECB             :             }
    7151 GIC         110 :             else if (result)
    7152 CBC           8 :                 continue;
    7153 ECB             : 
    7154                 :             /* ignore members that don't conflict with the lock we want */
    7155 GIC         102 :             if (!DoLockModesConflict(memlockmode, wanted))
    7156 CBC          67 :                 continue;
    7157 ECB             : 
    7158 CBC          35 :             if (ISUPDATE_from_mxstatus(members[i].status))
    7159 ECB             :             {
    7160                 :                 /* ignore aborted updaters */
    7161 GIC          17 :                 if (TransactionIdDidAbort(memxid))
    7162               1 :                     continue;
    7163 ECB             :             }
    7164                 :             else
    7165                 :             {
    7166                 :                 /* ignore lockers-only that are no longer in progress */
    7167 CBC          18 :                 if (!TransactionIdIsInProgress(memxid))
    7168               5 :                     continue;
    7169 ECB             :             }
    7170                 : 
    7171                 :             /*
    7172                 :              * Whatever remains is either a live locker that conflicts with our
    7173                 :              * wanted lock, or an updater that is not aborted; both conflict
    7174                 :              * with what we want.  Set up to return true, but keep going to
    7175                 :              * look for the current transaction among the multixact members,
    7176                 :              * if needed.
    7177                 :              */
    7178 CBC          29 :             result = true;
    7179 ECB             :         }
    7180 GIC          94 :         pfree(members);
    7181                 :     }
    7182                 : 
    7183              94 :     return result;
    7184                 : }
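
The scan above has a slightly unusual exit rule: once a conflict is found, it keeps walking the members only while the caller still needs current_is_member answered. Here is a standalone model of that control flow, with hypothetical demo_* names standing in for the real member data:

    #include <stdbool.h>
    #include <stdio.h>

    struct demo_member
    {
        bool is_self;       /* member belongs to the current transaction */
        bool conflicts;     /* member's lock mode conflicts with ours */
    };

    static bool
    demo_conflict_scan(const struct demo_member *m, int n, bool *self_is_member)
    {
        bool result = false;

        for (int i = 0; i < n; i++)
        {
            /* both answers known: nothing left to learn from the array */
            if (result && (self_is_member == NULL || *self_is_member))
                break;
            if (m[i].is_self)
            {
                if (self_is_member)
                    *self_is_member = true;     /* track, but never conflict */
                continue;
            }
            if (m[i].conflicts)
                result = true;                  /* keep scanning for self */
        }
        return result;
    }

    int
    main(void)
    {
        struct demo_member members[] = {{false, true}, {true, false}};
        bool self = false;
        bool conflict = demo_conflict_scan(members, 2, &self);

        printf("conflict=%d self=%d\n", conflict, self);    /* 1 1 */
        return 0;
    }
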
    7185                 : 
    7186                 : /*
    7187                 :  * Do_MultiXactIdWait
    7188                 :  *      Actual implementation for the two functions below.
    7189                 :  *
    7190                 :  * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
    7191                 :  * needed to ensure we only sleep on conflicting members, and the infomask is
    7192 ECB             :  * used to optimize multixact access in case it's a lock-only multi); 'nowait'
    7193                 :  * indicates whether to use conditional lock acquisition, to allow callers to
    7194                 :  * fail if lock is unavailable.  'rel', 'ctid' and 'oper' are used to set up
    7195                 :  * context information for error messages.  'remaining', if not NULL, receives
    7196                 :  * the number of members that are still running, including any (non-aborted)
    7197                 :  * subtransactions of our own transaction.
    7198                 :  *
    7199                 :  * We do this by sleeping on each member using XactLockTableWait.  Any
    7200                 :  * members that belong to the current backend are *not* waited for, however;
    7201                 :  * this would not merely be useless but would lead to Assert failure inside
    7202                 :  * XactLockTableWait.  By the time this returns, it is certain that all
    7203                 :  * transactions *of other backends* that were members of the MultiXactId
    7204                 :  * that conflict with the requested status are dead (and no new ones can have
    7205                 :  * been added, since it is not legal to add members to an existing
    7206                 :  * MultiXactId).
    7207                 :  *
    7208                 :  * But by the time we finish sleeping, someone else may have changed the Xmax
    7209                 :  * of the containing tuple, so the caller needs to iterate on us somehow.
    7210                 :  *
    7211                 :  * Note that if we return false, the number of remaining members is
    7212                 :  * not to be trusted.
    7213                 :  */
    7214                 : static bool
    7215 CBC          56 : Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
    7216                 :                    uint16 infomask, bool nowait,
    7217                 :                    Relation rel, ItemPointer ctid, XLTW_Oper oper,
    7218 ECB             :                    int *remaining)
    7219                 : {
    7220 GIC          56 :     bool        result = true;
    7221                 :     MultiXactMember *members;
    7222                 :     int         nmembers;
    7223              56 :     int         remain = 0;
    7224                 : 
    7225                 :     /* for pre-pg_upgrade tuples, no need to sleep at all */
    7226              56 :     nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
    7227              56 :         GetMultiXactIdMembers(multi, &members, false,
    7228              56 :                               HEAP_XMAX_IS_LOCKED_ONLY(infomask));
    7229                 : 
    7230 CBC          56 :     if (nmembers >= 0)
    7231                 :     {
    7232                 :         int         i;
    7233 ECB             : 
    7234 GIC         181 :         for (i = 0; i < nmembers; i++)
    7235                 :         {
    7236             129 :             TransactionId memxid = members[i].xid;
    7237             129 :             MultiXactStatus memstatus = members[i].status;
    7238                 : 
    7239             129 :             if (TransactionIdIsCurrentTransactionId(memxid))
    7240                 :             {
    7241              24 :                 remain++;
    7242              24 :                 continue;
    7243                 :             }
    7244 ECB             : 
    7245 GIC         105 :             if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
    7246 CBC         105 :                                      LOCKMODE_from_mxstatus(status)))
    7247 ECB             :             {
    7248 GIC          20 :                 if (remaining && TransactionIdIsInProgress(memxid))
    7249               6 :                     remain++;
    7250              20 :                 continue;
    7251                 :             }
    7252                 : 
    7253                 :             /*
    7254                 :              * This member conflicts with our multi, so we have to sleep (or
    7255                 :              * return failure, if asked to avoid waiting).
    7256                 :              *
    7257                 :              * Note that we don't set up an error context callback ourselves,
    7258                 :              * but instead we pass the info down to XactLockTableWait.  This
    7259                 :              * might seem a bit wasteful because the context is set up and
    7260 ECB             :              * torn down for each member of the multixact, but in reality it
    7261                 :              * should be barely noticeable, and it avoids duplicate code.
    7262                 :              */
    7263 GIC          85 :             if (nowait)
    7264                 :             {
    7265 CBC           4 :                 result = ConditionalXactLockTableWait(memxid);
    7266               4 :                 if (!result)
    7267 GIC           4 :                     break;
    7268 ECB             :             }
    7269 EUB             :             else
    7270 GIC          81 :                 XactLockTableWait(memxid, rel, ctid, oper);
    7271 ECB             :         }
    7272                 : 
    7273 CBC          56 :         pfree(members);
    7274                 :     }
    7275                 : 
    7276 GIC          56 :     if (remaining)
    7277 CBC           8 :         *remaining = remain;
    7278                 : 
    7279 GIC          56 :     return result;
    7280                 : }
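
A standalone model of the sleep loop above, with hypothetical demo_* names and a stub in place of ConditionalXactLockTableWait. It shows the two rules the header comment describes: members of our own transaction are counted as remaining but never waited on, and under nowait the first member we would have to sleep on fails the whole operation, leaving *remaining untouched (hence untrusted):

    #include <stdbool.h>
    #include <stdio.h>

    struct demo_member
    {
        bool is_self;       /* belongs to our own transaction? */
        bool conflicts;     /* lock mode conflicts with requested status? */
        bool in_progress;   /* member transaction still running? */
    };

    /* stub standing in for conditional lock acquisition: lock unavailable */
    static bool demo_try_lock(void) { return false; }

    static bool
    demo_multixact_wait(const struct demo_member *m, int n,
                        bool nowait, int *remaining)
    {
        int remain = 0;

        for (int i = 0; i < n; i++)
        {
            if (m[i].is_self)
            {
                remain++;           /* count, but never wait on ourselves */
                continue;
            }
            if (!m[i].conflicts)
            {
                if (m[i].in_progress)
                    remain++;       /* non-conflicting but still running */
                continue;
            }
            if (nowait && !demo_try_lock())
                return false;       /* would block: fail; remain untrusted */
            /* else: block until the member transaction ends (omitted) */
        }
        if (remaining)
            *remaining = remain;
        return true;
    }

    int
    main(void)
    {
        struct demo_member members[] = {
            {true, false, true},    /* our own subtransaction */
            {false, true, true},    /* conflicting live locker */
        };
        int remain = -1;
        bool ok = demo_multixact_wait(members, 2, true, &remain);

        printf("ok=%d remain=%d\n", ok, remain);    /* ok=0 remain=-1 */
        return 0;
    }
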
    7281                 : 
    7282 ECB             : /*
    7283                 :  * MultiXactIdWait
    7284                 :  *      Sleep on a MultiXactId.
    7285                 :  *
    7286                 :  * By the time we finish sleeping, someone else may have changed the Xmax
    7287                 :  * of the containing tuple, so the caller needs to iterate on us somehow.
    7288                 :  *
    7289                 :  * We return (in *remaining, if not NULL) the number of members that are still
    7290                 :  * running, including any (non-aborted) subtransactions of our own transaction.
    7291                 :  */
    7292                 : static void
    7293 CBC          52 : MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
    7294                 :                 Relation rel, ItemPointer ctid, XLTW_Oper oper,
    7295 ECB             :                 int *remaining)
    7296                 : {
    7297 GIC          52 :     (void) Do_MultiXactIdWait(multi, status, infomask, false,
    7298                 :                               rel, ctid, oper, remaining);
    7299 CBC          52 : }
    7300 ECB             : 
    7301                 : /*
    7302                 :  * ConditionalMultiXactIdWait
    7303                 :  *      As above, but only lock if we can get the lock without blocking.
    7304                 :  *
    7305                 :  * By the time we finish sleeping, someone else may have changed the Xmax
    7306                 :  * of the containing tuple, so the caller needs to iterate on us somehow.
    7307                 :  *
    7308                 :  * Returns true if the multixact is now all gone; returns false if some
    7309                 :  * member transactions might still be running.
    7310                 :  *
    7311                 :  * We return (in *remaining, if not NULL) the number of members that are still
    7312                 :  * running, including any (non-aborted) subtransactions of our own transaction.
    7313                 :  */
    7314                 : static bool
    7315 GIC           4 : ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
    7316                 :                            uint16 infomask, Relation rel, int *remaining)
    7317                 : {
    7318               4 :     return Do_MultiXactIdWait(multi, status, infomask, true,
    7319                 :                               rel, NULL, XLTW_None, remaining);
    7320                 : }
    7321                 : 
    7322 ECB             : /*
    7323                 :  * heap_tuple_needs_eventual_freeze
    7324                 :  *
    7325                 :  * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
    7326                 :  * will eventually require freezing (if the tuple isn't removed by pruning first).
    7327                 :  */
    7328                 : bool
    7329 GIC     7027764 : heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
    7330                 : {
    7331                 :     TransactionId xid;
    7332                 : 
    7333                 :     /*
    7334                 :      * If xmin is a normal transaction ID, this tuple is definitely not
    7335                 :      * frozen.
    7336                 :      */
    7337         7027764 :     xid = HeapTupleHeaderGetXmin(tuple);
    7338         7027764 :     if (TransactionIdIsNormal(xid))
    7339           13376 :         return true;
    7340                 : 
    7341                 :     /*
    7342                 :      * If xmax is a valid xact or multixact, this tuple is also not frozen.
    7343                 :      */
    7344         7014388 :     if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
    7345                 :     {
    7346                 :         MultiXactId multi;
    7347                 : 
    7348               2 :         multi = HeapTupleHeaderGetRawXmax(tuple);
    7349               2 :         if (MultiXactIdIsValid(multi))
    7350               2 :             return true;
    7351                 :     }
    7352                 :     else
    7353                 :     {
    7354         7014386 :         xid = HeapTupleHeaderGetRawXmax(tuple);
    7355         7014386 :         if (TransactionIdIsNormal(xid))
    7356               4 :             return true;
    7357                 :     }
    7358                 : 
    7359 CBC     7014382 :     if (tuple->t_infomask & HEAP_MOVED)
    7360                 :     {
    7361 UIC           0 :         xid = HeapTupleHeaderGetXvac(tuple);
    7362               0 :         if (TransactionIdIsNormal(xid))
    7363               0 :             return true;
    7364 ECB             :     }
    7365                 : 
    7366 GIC     7014382 :     return false;
    7367 ECB             : }
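
A standalone model of the xmin/xmax test above. The special XID values are assumptions modeled on src/include/access/transam.h: 0 is invalid, 1 bootstrap, 2 frozen, and anything from 3 up is a normal XID that will eventually need freezing:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t DemoXid;

    #define DEMO_FIRST_NORMAL_XID   3
    #define DEMO_XID_IS_NORMAL(x)   ((x) >= DEMO_FIRST_NORMAL_XID)

    struct demo_tuple
    {
        DemoXid xmin;       /* inserting XID (frozen form shown as 2) */
        DemoXid xmax;       /* deleting/locking XID, 0 if none */
    };

    static bool
    demo_needs_eventual_freeze(const struct demo_tuple *tup)
    {
        if (DEMO_XID_IS_NORMAL(tup->xmin))
            return true;    /* inserter not yet frozen */
        if (DEMO_XID_IS_NORMAL(tup->xmax))
            return true;    /* live deleter or locker */
        return false;
    }

    int
    main(void)
    {
        struct demo_tuple frozen = {2, 0};      /* frozen xmin, invalid xmax */
        struct demo_tuple live = {12345, 0};

        printf("%d %d\n",
               demo_needs_eventual_freeze(&frozen),     /* 0 */
               demo_needs_eventual_freeze(&live));      /* 1 */
        return 0;
    }
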
    7368                 : 
    7369                 : /*
    7370                 :  * heap_tuple_should_freeze
    7371                 :  *
    7372                 :  * Return value indicates whether the sibling function heap_prepare_freeze_tuple
    7373                 :  * would (or should) force freezing of the heap page that contains caller's tuple.
    7374                 :  * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing.
    7375                 :  * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs.
    7376                 :  *
    7377                 :  * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output
    7378                 :  * arguments help VACUUM track the oldest extant XID/MXID remaining in rel.
    7379                 :  * Our working assumption is that caller won't decide to freeze this tuple.
    7380                 :  * It's up to caller to only ratchet back its own top-level trackers after the
    7381                 :  * point that it fully commits to not freezing the tuple/page in question.
    7382                 :  */
    7383                 : bool
    7384 GNC     2208045 : heap_tuple_should_freeze(HeapTupleHeader tuple,
    7385                 :                          const struct VacuumCutoffs *cutoffs,
    7386                 :                          TransactionId *NoFreezePageRelfrozenXid,
    7387                 :                          MultiXactId *NoFreezePageRelminMxid)
    7388 ECB             : {
    7389                 :     TransactionId xid;
    7390                 :     MultiXactId multi;
    7391 GNC     2208045 :     bool        freeze = false;
    7392 ECB             : 
    7393                 :     /* First deal with xmin */
    7394 CBC     2208045 :     xid = HeapTupleHeaderGetXmin(tuple);
    7395         2208045 :     if (TransactionIdIsNormal(xid))
    7396 ECB             :     {
    7397 GNC     2207897 :         Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
    7398         2207897 :         if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
    7399           80949 :             *NoFreezePageRelfrozenXid = xid;
    7400         2207897 :         if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
    7401           79971 :             freeze = true;
    7402                 :     }
    7403                 : 
    7404                 :     /* Now deal with xmax */
    7405 GIC     2208045 :     xid = InvalidTransactionId;
    7406         2208045 :     multi = InvalidMultiXactId;
    7407         2208045 :     if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
    7408               2 :         multi = HeapTupleHeaderGetRawXmax(tuple);
    7409                 :     else
    7410 CBC     2208043 :         xid = HeapTupleHeaderGetRawXmax(tuple);
    7411                 : 
    7412         2208045 :     if (TransactionIdIsNormal(xid))
    7413 ECB             :     {
    7414 GNC      320768 :         Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
    7415 ECB             :         /* xmax is a non-permanent XID */
    7416 GNC      320768 :         if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
    7417               4 :             *NoFreezePageRelfrozenXid = xid;
    7418          320768 :         if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
    7419               1 :             freeze = true;
    7420                 :     }
    7421 CBC     1887277 :     else if (!MultiXactIdIsValid(multi))
    7422                 :     {
    7423                 :         /* xmax is a permanent XID or invalid MultiXactId/XID */
    7424 ECB             :     }
    7425 CBC           2 :     else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
    7426                 :     {
    7427 ECB             :         /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
    7428 UNC           0 :         if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
    7429               0 :             *NoFreezePageRelminMxid = multi;
    7430                 :         /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
    7431               0 :         freeze = true;
    7432                 :     }
    7433                 :     else
    7434                 :     {
    7435                 :         /* xmax is a MultiXactId that may have an updater XID */
    7436                 :         MultiXactMember *members;
    7437                 :         int         nmembers;
    7438                 : 
    7439 GNC           2 :         Assert(MultiXactIdPrecedesOrEquals(cutoffs->relminmxid, multi));
    7440               2 :         if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
    7441               2 :             *NoFreezePageRelminMxid = multi;
    7442               2 :         if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
    7443               2 :             freeze = true;
    7444                 : 
    7445                 :         /* need to check whether any member of the mxact is old */
    7446 CBC           2 :         nmembers = GetMultiXactIdMembers(multi, &members, false,
    7447 GIC           2 :                                          HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask));
    7448 ECB             : 
    7449 GIC           5 :         for (int i = 0; i < nmembers; i++)
    7450                 :         {
    7451               3 :             xid = members[i].xid;
    7452 GNC           3 :             Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
    7453               3 :             if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
    7454 UNC           0 :                 *NoFreezePageRelfrozenXid = xid;
    7455 GNC           3 :             if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
    7456 UNC           0 :                 freeze = true;
    7457                 :         }
    7458 GIC           2 :         if (nmembers > 0)
    7459               1 :             pfree(members);
    7460                 :     }
    7461                 : 
    7462         2208045 :     if (tuple->t_infomask & HEAP_MOVED)
    7463                 :     {
    7464 LBC           0 :         xid = HeapTupleHeaderGetXvac(tuple);
    7465 UIC           0 :         if (TransactionIdIsNormal(xid))
    7466                 :         {
    7467 UNC           0 :             Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
    7468               0 :             if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
    7469               0 :                 *NoFreezePageRelfrozenXid = xid;
    7470                 :             /* heap_prepare_freeze_tuple forces xvac freezing */
    7471               0 :             freeze = true;
    7472                 :         }
    7473                 :     }
    7474                 : 
    7475 GNC     2208045 :     return freeze;
    7476                 : }
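
The tracker logic above does two things per XID: ratchet *NoFreezePageRelfrozenXid back to the oldest XID seen, and set freeze when an XID precedes FreezeLimit. Here is a standalone sketch of both, using the signed-difference circular comparison in the style of transam.h (simplified: the real TransactionIdPrecedes also special-cases permanent XIDs):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t DemoXid;

    /* wraparound-aware "a precedes b", for normal XIDs only */
    static bool
    demo_precedes(DemoXid a, DemoXid b)
    {
        return (int32_t) (a - b) < 0;
    }

    int
    main(void)
    {
        DemoXid freeze_limit = 5000;    /* stands in for cutoffs->FreezeLimit */
        DemoXid oldest = 9000;          /* stands in for the page tracker */
        DemoXid xids[] = {8000, 4000, 7000};
        bool freeze = false;

        for (int i = 0; i < 3; i++)
        {
            if (demo_precedes(xids[i], oldest))
                oldest = xids[i];               /* ratchet tracker backwards */
            if (demo_precedes(xids[i], freeze_limit))
                freeze = true;                  /* old XID: page should freeze */
        }
        printf("oldest=%u freeze=%d\n", oldest, freeze);  /* 4000, 1 */

        /* the signed difference keeps working across the 2^32 wraparound */
        printf("%d\n", demo_precedes(UINT32_MAX - 10, 10));     /* 1 */
        return 0;
    }
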
    7477                 : 
    7478                 : /*
    7479                 :  * Maintain snapshotConflictHorizon for caller by ratcheting forward its value
    7480                 :  * using any committed XIDs contained in 'tuple', an obsolescent heap tuple
    7481                 :  * that caller is in the process of physically removing, e.g. via HOT pruning
    7482                 :  * or index deletion.
    7483                 :  *
    7484                 :  * Caller must initialize its value to InvalidTransactionId, which is
    7485                 :  * generally interpreted as "definitely no need for a recovery conflict".
    7486                 :  * Final value must reflect all heap tuples that caller will physically remove
    7487                 :  * (or remove TID references to) via its ongoing pruning/deletion operation.
    7488                 :  * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from
    7489                 :  * caller's WAL record) by REDO routine when it replays caller's operation.
    7490                 :  */
    7491                 : void
    7492         1575911 : HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple,
    7493                 :                                       TransactionId *snapshotConflictHorizon)
    7494 ECB             : {
    7495 CBC     1575911 :     TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
    7496 GIC     1575911 :     TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
    7497         1575911 :     TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
    7498                 : 
    7499         1575911 :     if (tuple->t_infomask & HEAP_MOVED)
    7500 ECB             :     {
    7501 UNC           0 :         if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
    7502               0 :             *snapshotConflictHorizon = xvac;
    7503                 :     }
    7504 ECB             : 
    7505                 :     /*
    7506                 :      * Ignore tuples inserted by an aborted transaction or if the tuple was
    7507                 :      * updated/deleted by the inserting transaction.
    7508                 :      *
    7509                 :      * Look for a committed hint bit, or if no xmin bit is set, check clog.
    7510                 :      */
    7511 CBC     1575911 :     if (HeapTupleHeaderXminCommitted(tuple) ||
    7512           52461 :         (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
    7513                 :     {
    7514 GIC     2852914 :         if (xmax != xmin &&
    7515 GNC     1328708 :             TransactionIdFollows(xmax, *snapshotConflictHorizon))
    7516          165059 :             *snapshotConflictHorizon = xmax;
    7517 EUB             :     }
    7518 GIC     1575911 : }
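
A standalone sketch of the ratchet above, with hypothetical demo_* names. It also shows why InvalidTransactionId works as the "definitely no conflict" initializer: under the assumed transam.h-style comparison, every normal XID follows it, so the first committed XID installs itself:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t DemoXid;

    #define DEMO_FIRST_NORMAL_XID 3

    /* mirrors the shape of transam.h's TransactionIdFollows (an assumption) */
    static bool
    demo_follows(DemoXid a, DemoXid b)
    {
        if (a < DEMO_FIRST_NORMAL_XID || b < DEMO_FIRST_NORMAL_XID)
            return a > b;                   /* permanent/invalid: plain compare */
        return (int32_t) (a - b) > 0;       /* circular compare, normal XIDs */
    }

    int
    main(void)
    {
        DemoXid horizon = 0;                /* InvalidTransactionId */
        DemoXid removed_xmax[] = {700, 650, 900};

        /* every normal XID "follows" 0, so the first one installs itself */
        for (int i = 0; i < 3; i++)
            if (demo_follows(removed_xmax[i], horizon))
                horizon = removed_xmax[i];

        printf("horizon=%u\n", horizon);    /* 900: newest XID being removed */
        return 0;
    }
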
    7519                 : 
    7520 ECB             : #ifdef USE_PREFETCH
    7521                 : /*
    7522                 :  * Helper function for heap_index_delete_tuples.  Issues prefetch requests for
    7523                 :  * prefetch_count buffers.  The prefetch_state keeps track of all the buffers
    7524                 :  * we can prefetch, and which have already been prefetched; each call to this
    7525                 :  * function picks up where the previous call left off.
    7526                 :  *
    7527                 :  * Note: we expect the deltids array to be sorted in an order that groups TIDs
    7528                 :  * by heap block, with all TIDs for each block appearing together in exactly
    7529                 :  * one group.
    7530                 :  */
    7531                 : static void
    7532 GIC       25415 : index_delete_prefetch_buffer(Relation rel,
    7533                 :                              IndexDeletePrefetchState *prefetch_state,
    7534                 :                              int prefetch_count)
    7535                 : {
    7536           25415 :     BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
    7537           25415 :     int         count = 0;
    7538 ECB             :     int         i;
    7539 GIC       25415 :     int         ndeltids = prefetch_state->ndeltids;
    7540           25415 :     TM_IndexDelete *deltids = prefetch_state->deltids;
    7541                 : 
    7542           25415 :     for (i = prefetch_state->next_item;
    7543          879445 :          i < ndeltids && count < prefetch_count;
    7544          854030 :          i++)
    7545 ECB             :     {
    7546 GIC      854030 :         ItemPointer htid = &deltids[i].tid;
    7547                 : 
    7548 CBC     1699780 :         if (cur_hblkno == InvalidBlockNumber ||
    7549          845750 :             ItemPointerGetBlockNumber(htid) != cur_hblkno)
    7550                 :         {
    7551           25166 :             cur_hblkno = ItemPointerGetBlockNumber(htid);
    7552           25166 :             PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
    7553           25166 :             count++;
    7554 ECB             :         }
    7555                 :     }
    7556                 : 
    7557                 :     /*
    7558                 :      * Save the prefetch position so that next time we can continue from that
    7559                 :      * position.
    7560                 :      */
    7561 CBC       25415 :     prefetch_state->next_item = i;
    7562           25415 :     prefetch_state->cur_hblkno = cur_hblkno;
    7563 GIC       25415 : }
    7564 ECB             : #endif
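
A standalone model of the prefetch bookkeeping above (hypothetical names, printf standing in for PrefetchBuffer): walk the block-sorted TID array from where the last call stopped, count one prefetch per new block, and stop after prefetch_count distinct blocks so the prefetch distance stays bounded:

    #include <stdint.h>
    #include <stdio.h>

    struct demo_prefetch_state
    {
        int         next_item;      /* resume position in the TID array */
        uint32_t    cur_blkno;      /* last block "prefetched" */
    };

    static void
    demo_prefetch(const uint32_t *blknos, int n,
                  struct demo_prefetch_state *st, int prefetch_count)
    {
        int count = 0;
        int i;

        for (i = st->next_item; i < n && count < prefetch_count; i++)
        {
            if (blknos[i] != st->cur_blkno)
            {
                st->cur_blkno = blknos[i];
                printf("prefetch block %u\n", blknos[i]);  /* stands in for I/O */
                count++;
            }
        }
        st->next_item = i;          /* next call picks up exactly here */
    }

    int
    main(void)
    {
        /* block-sorted, as the deltids array is expected to be */
        uint32_t blknos[] = {7, 7, 7, 9, 9, 12, 12, 15};
        struct demo_prefetch_state st = {0, UINT32_MAX};    /* "invalid" block */

        demo_prefetch(blknos, 8, &st, 2);   /* prefetches blocks 7 and 9 */
        demo_prefetch(blknos, 8, &st, 1);   /* continues with block 12 */
        return 0;
    }
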
    7565                 : 
    7566                 : /*
    7567                 :  * Helper function for heap_index_delete_tuples.  Checks for index corruption
    7568                 :  * involving an invalid TID in index AM caller's index page.
    7569                 :  *
    7570                 :  * This is an ideal place for these checks.  The index AM must hold a buffer
    7571                 :  * lock on the index page containing the TIDs we examine here, so we don't
    7572                 :  * have to worry about concurrent VACUUMs at all.  We can be sure that the
    7573                 :  * index is corrupt when htid points directly to an LP_UNUSED item or
    7574                 :  * heap-only tuple, which is not the case during standard index scans.
    7575                 :  */
    7576                 : static inline void
    7577 GIC      621751 : index_delete_check_htid(TM_IndexDeleteOp *delstate,
    7578                 :                         Page page, OffsetNumber maxoff,
    7579 ECB             :                         ItemPointer htid, TM_IndexStatus *istatus)
    7580                 : {
    7581 GIC      621751 :     OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid);
    7582 EUB             :     ItemId      iid;
    7583                 : 
    7584 GIC      621751 :     Assert(OffsetNumberIsValid(istatus->idxoffnum));
    7585 EUB             : 
    7586 GIC      621751 :     if (unlikely(indexpagehoffnum > maxoff))
    7587 UIC           0 :         ereport(ERROR,
    7588                 :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    7589                 :                  errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
    7590                 :                                  ItemPointerGetBlockNumber(htid),
    7591                 :                                  indexpagehoffnum,
    7592                 :                                  istatus->idxoffnum, delstate->iblknum,
    7593 ECB             :                                  RelationGetRelationName(delstate->irel))));
    7594                 : 
    7595 CBC      621751 :     iid = PageGetItemId(page, indexpagehoffnum);
    7596          621751 :     if (unlikely(!ItemIdIsUsed(iid)))
    7597 LBC           0 :         ereport(ERROR,
    7598                 :                 (errcode(ERRCODE_INDEX_CORRUPTED),
    7599                 :                  errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
    7600 ECB             :                                  ItemPointerGetBlockNumber(htid),
    7601                 :                                  indexpagehoffnum,
    7602                 :                                  istatus->idxoffnum, delstate->iblknum,
    7603                 :                                  RelationGetRelationName(delstate->irel))));
    7604                 : 
    7605 CBC      621751 :     if (ItemIdHasStorage(iid))
    7606 ECB             :     {
    7607                 :         HeapTupleHeader htup;
    7608 EUB             : 
    7609 CBC      389368 :         Assert(ItemIdIsNormal(iid));
    7610 GBC      389368 :         htup = (HeapTupleHeader) PageGetItem(page, iid);
    7611                 : 
    7612 CBC      389368 :         if (unlikely(HeapTupleHeaderIsHeapOnly(htup)))
    7613 LBC           0 :             ereport(ERROR,
    7614                 :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    7615                 :                      errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
    7616 ECB             :                                      ItemPointerGetBlockNumber(htid),
    7617                 :                                      indexpagehoffnum,
    7618 EUB             :                                      istatus->idxoffnum, delstate->iblknum,
    7619                 :                                      RelationGetRelationName(delstate->irel))));
    7620                 :     }
    7621 GBC      621751 : }
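
A standalone model of the three corruption tests above, with hypothetical demo_* names: an index TID is invalid if its offset runs past the page's line pointer array, lands on an unused item, or lands on a heap-only tuple:

    #include <stdio.h>

    enum demo_lp { DEMO_UNUSED, DEMO_NORMAL_ROOT, DEMO_NORMAL_HEAPONLY };

    static const char *
    demo_check_htid(const enum demo_lp *items, int maxoff, int offnum)
    {
        if (offnum < 1 || offnum > maxoff)
            return "corrupt: past end of line pointer array";
        if (items[offnum - 1] == DEMO_UNUSED)
            return "corrupt: unused item";
        if (items[offnum - 1] == DEMO_NORMAL_HEAPONLY)
            return "corrupt: heap-only tuple";
        return "ok";
    }

    int
    main(void)
    {
        enum demo_lp page[] = {DEMO_NORMAL_ROOT, DEMO_UNUSED, DEMO_NORMAL_HEAPONLY};

        printf("%s\n", demo_check_htid(page, 3, 1));    /* ok */
        printf("%s\n", demo_check_htid(page, 3, 2));    /* unused item */
        printf("%s\n", demo_check_htid(page, 3, 4));    /* past array */
        return 0;
    }
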
    7622 EUB             : 
    7623                 : /*
    7624                 :  * heapam implementation of tableam's index_delete_tuples interface.
    7625                 :  *
    7626                 :  * This helper function is called by index AMs during index tuple deletion.
    7627                 :  * See tableam header comments for an explanation of the interface implemented
    7628                 :  * here and a general theory of operation.  Note that each call here is either
    7629 ECB             :  * a simple index deletion call, or a bottom-up index deletion call.
    7630                 :  *
    7631                 :  * It's possible for this to generate a fair amount of I/O, since we may be
    7632                 :  * deleting hundreds of tuples from a single index block.  To amortize that
    7633                 :  * cost to some degree, this uses prefetching and combines repeat accesses to
    7634                 :  * the same heap block.
    7635                 :  */
    7636                 : TransactionId
    7637 GIC        8280 : heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
    7638                 : {
    7639                 :     /* Initial assumption is that earlier pruning took care of conflict */
    7640 GNC        8280 :     TransactionId snapshotConflictHorizon = InvalidTransactionId;
    7641 GIC        8280 :     BlockNumber blkno = InvalidBlockNumber;
    7642            8280 :     Buffer      buf = InvalidBuffer;
    7643            8280 :     Page        page = NULL;
    7644            8280 :     OffsetNumber maxoff = InvalidOffsetNumber;
    7645                 :     TransactionId priorXmax;
    7646 ECB             : #ifdef USE_PREFETCH
    7647                 :     IndexDeletePrefetchState prefetch_state;
    7648                 :     int         prefetch_distance;
    7649                 : #endif
    7650                 :     SnapshotData SnapshotNonVacuumable;
    7651 CBC        8280 :     int         finalndeltids = 0,
    7652 GIC        8280 :                 nblocksaccessed = 0;
    7653 ECB             : 
    7654                 :     /* State that's only used in bottom-up index deletion case */
    7655 GBC        8280 :     int         nblocksfavorable = 0;
    7656            8280 :     int         curtargetfreespace = delstate->bottomupfreespace,
    7657 GIC        8280 :                 lastfreespace = 0,
    7658            8280 :                 actualfreespace = 0;
    7659            8280 :     bool        bottomup_final_block = false;
    7660                 : 
    7661            8280 :     InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(rel));
    7662                 : 
    7663                 :     /* Sort caller's deltids array by TID for further processing */
    7664            8280 :     index_delete_sort(delstate);
    7665 ECB             : 
    7666                 :     /*
    7667                 :      * Bottom-up case: resort deltids array in an order attuned to where the
    7668                 :      * greatest number of promising TIDs are to be found, and determine how
    7669                 :      * many blocks from the start of sorted array should be considered
    7670                 :      * favorable.  This will also shrink the deltids array in order to
    7671                 :      * eliminate completely unfavorable blocks up front.
    7672                 :      */
    7673 GIC        8280 :     if (delstate->bottomup)
    7674            3512 :         nblocksfavorable = bottomup_sort_and_shrink(delstate);
    7675                 : 
    7676                 : #ifdef USE_PREFETCH
    7677                 :     /* Initialize prefetch state. */
    7678            8280 :     prefetch_state.cur_hblkno = InvalidBlockNumber;
    7679            8280 :     prefetch_state.next_item = 0;
    7680            8280 :     prefetch_state.ndeltids = delstate->ndeltids;
    7681            8280 :     prefetch_state.deltids = delstate->deltids;
    7682                 : 
    7683                 :     /*
    7684                 :      * Determine the prefetch distance that we will attempt to maintain.
    7685                 :      *
    7686 ECB             :      * Since the caller holds a buffer lock somewhere in rel, we'd better make
    7687                 :      * sure that isn't a catalog relation before we call code that does
    7688                 :      * syscache lookups, to avoid risk of deadlock.
    7689                 :      */
    7690 CBC        8280 :     if (IsCatalogRelation(rel))
    7691            6680 :         prefetch_distance = maintenance_io_concurrency;
    7692                 :     else
    7693 ECB             :         prefetch_distance =
    7694 CBC        1600 :             get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
    7695                 : 
    7696 ECB             :     /* Cap initial prefetch distance for bottom-up deletion caller */
    7697 CBC        8280 :     if (delstate->bottomup)
    7698 ECB             :     {
    7699 GIC        3512 :         Assert(nblocksfavorable >= 1);
    7700 CBC        3512 :         Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS);
    7701 GIC        3512 :         prefetch_distance = Min(prefetch_distance, nblocksfavorable);
    7702 ECB             :     }
    7703                 : 
    7704                 :     /* Start prefetching. */
    7705 CBC        8280 :     index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
    7706 ECB             : #endif
    7707                 : 
    7708                 :     /* Iterate over deltids, determine which to delete, check their horizon */
    7709 GIC        8280 :     Assert(delstate->ndeltids > 0);
    7710          630031 :     for (int i = 0; i < delstate->ndeltids; i++)
    7711                 :     {
    7712          625262 :         TM_IndexDelete *ideltid = &delstate->deltids[i];
    7713          625262 :         TM_IndexStatus *istatus = delstate->status + ideltid->id;
    7714          625262 :         ItemPointer htid = &ideltid->tid;
    7715 ECB             :         OffsetNumber offnum;
    7716                 : 
    7717                 :         /*
    7718                 :          * Read buffer, and perform required extra steps each time a new block
    7719                 :          * is encountered.  Avoid refetching if it's the same block as the one
    7720                 :          * from the last htid.
    7721                 :          */
    7722 GIC     1242244 :         if (blkno == InvalidBlockNumber ||
    7723          616982 :             ItemPointerGetBlockNumber(htid) != blkno)
    7724                 :         {
    7725                 :             /*
    7726                 :              * Consider giving up early for bottom-up index deletion caller
    7727                 :              * first. (Only prefetch next-next block afterwards, when it
    7728                 :              * becomes clear that we're at least going to access the next
    7729                 :              * block in line.)
    7730                 :              *
    7731 ECB             :              * Sometimes the first block frees so much space for bottom-up
    7732                 :              * caller that the deletion process can end without accessing any
    7733                 :              * more blocks.  It is usually necessary to access 2 or 3 blocks
    7734                 :              * per bottom-up deletion operation, though.
    7735                 :              */
    7736 GIC       20646 :             if (delstate->bottomup)
    7737                 :             {
    7738 ECB             :                 /*
    7739                 :                  * We often allow caller to delete a few additional items
    7740                 :                  * whose entries we reached after the point at which caller's
    7741                 :                  * space target was satisfied.  The cost of accessing the page
    7742                 :                  * was already paid at that point, so it made sense to finish
    7743                 :                  * it off.  When that happened, we finalize everything here
    7744                 :                  * (by finishing off the whole bottom-up deletion operation
    7745                 :                  * without needlessly paying the cost of accessing any more
    7746                 :                  * blocks).
    7747                 :                  */
    7748 GIC        7439 :                 if (bottomup_final_block)
    7749 CBC         107 :                     break;
    7750 ECB             : 
    7751 EUB             :                 /*
    7752                 :                  * Give up when we didn't enable our caller to free any
    7753                 :                  * additional space as a result of processing the page that we
    7754                 :                  * just finished up with.  This rule is the main way in which
    7755                 :                  * we keep the cost of bottom-up deletion under control.
    7756                 :                  */
    7757 GIC        7332 :                 if (nblocksaccessed >= 1 && actualfreespace == lastfreespace)
    7758            3404 :                     break;
    7759 CBC        3928 :                 lastfreespace = actualfreespace;    /* for next time */
    7760                 : 
    7761                 :                 /*
    7762                 :                  * Deletion operation (which is bottom-up) will definitely
    7763 ECB             :                  * access the next block in line.  Prepare for that now.
    7764                 :                  *
    7765                 :                  * Decay target free space so that we don't hang on for too
    7766                 :                  * long with a marginal case. (Space target is only truly
    7767 EUB             :                  * helpful when it allows us to recognize that we don't need
    7768                 :                  * to access more than 1 or 2 blocks to satisfy caller due to
    7769                 :                  * agreeable workload characteristics.)
    7770                 :                  *
    7771                 :                  * We are a bit more patient when we encounter contiguous
    7772                 :                  * blocks, though: these are treated as favorable blocks.  The
    7773                 :                  * decay process is only applied when the next block in line
    7774                 :                  * is not a favorable/contiguous block.  This is not an
    7775 ECB             :                  * exception to the general rule; we still insist on finding
    7776                 :                  * at least one deletable item per block accessed.  See
    7777                 :                  * bottomup_nblocksfavorable() for full details of the theory
    7778                 :                  * behind favorable blocks and heap block locality in general.
    7779                 :                  *
    7780                 :                  * Note: The first block in line is always treated as a
    7781                 :                  * favorable block, so the earliest possible point that the
    7782                 :                  * decay can be applied is just before we access the second
    7783                 :                  * block in line.  The Assert() verifies this for us.
    7784                 :                  */
    7785 GIC        3928 :                 Assert(nblocksaccessed > 0 || nblocksfavorable > 0);
    7786            3928 :                 if (nblocksfavorable > 0)
    7787            3565 :                     nblocksfavorable--;
    7788                 :                 else
    7789             363 :                     curtargetfreespace /= 2;
    7790                 :             }
    7791 ECB             : 
    7792                 :             /* release old buffer */
    7793 GIC       17135 :             if (BufferIsValid(buf))
    7794 CBC        8855 :                 UnlockReleaseBuffer(buf);
    7795 ECB             : 
    7796 CBC       17135 :             blkno = ItemPointerGetBlockNumber(htid);
    7797           17135 :             buf = ReadBuffer(rel, blkno);
    7798           17135 :             nblocksaccessed++;
    7799 GIC       17135 :             Assert(!delstate->bottomup ||
    7800                 :                    nblocksaccessed <= BOTTOMUP_MAX_NBLOCKS);
    7801                 : 
    7802                 : #ifdef USE_PREFETCH
    7803                 : 
    7804                 :             /*
    7805 ECB             :              * To maintain the prefetch distance, prefetch one more page for
    7806                 :              * each page we read.
    7807                 :              */
    7808 GIC       17135 :             index_delete_prefetch_buffer(rel, &prefetch_state, 1);
    7809 ECB             : #endif
    7810                 : 
    7811 CBC       17135 :             LockBuffer(buf, BUFFER_LOCK_SHARE);
    7812 ECB             : 
    7813 CBC       17135 :             page = BufferGetPage(buf);
    7814 GIC       17135 :             maxoff = PageGetMaxOffsetNumber(page);
    7815 ECB             :         }
    7816                 : 
    7817                 :         /*
    7818                 :          * In passing, detect index corruption involving an index page with a
    7819                 :          * TID that points to a location in the heap that couldn't possibly be
    7820                 :          * correct.  We only do this with actual TIDs from caller's index page
    7821                 :          * (not items reached by traversing through a HOT chain).
    7822                 :          */
    7823 GIC      621751 :         index_delete_check_htid(delstate, page, maxoff, htid, istatus);
    7824                 : 
    7825          621751 :         if (istatus->knowndeletable)
    7826          125582 :             Assert(!delstate->bottomup && !istatus->promising);
    7827 ECB             :         else
    7828                 :         {
    7829 GIC      496169 :             ItemPointerData tmp = *htid;
    7830                 :             HeapTupleData heapTuple;
    7831                 : 
    7832 ECB             :             /* Are any tuples from this HOT chain non-vacuumable? */
    7833 CBC      496169 :             if (heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable,
    7834 ECB             :                                        &heapTuple, NULL, true))
    7835 CBC      359792 :                 continue;       /* can't delete entry */
    7836                 : 
    7837                 :             /* Caller will delete, since whole HOT chain is vacuumable */
    7838 GIC      136377 :             istatus->knowndeletable = true;
    7839                 : 
    7840                 :             /* Maintain index free space info for bottom-up deletion case */
    7841          136377 :             if (delstate->bottomup)
    7842                 :             {
    7843            8387 :                 Assert(istatus->freespace > 0);
    7844 CBC        8387 :                 actualfreespace += istatus->freespace;
    7845            8387 :                 if (actualfreespace >= curtargetfreespace)
    7846 GIC        3312 :                     bottomup_final_block = true;
    7847                 :             }
    7848 ECB             :         }
    7849                 : 
    7850                 :         /*
    7851                 :          * Maintain snapshotConflictHorizon value for deletion operation as a
    7852                 :          * whole by advancing current value using heap tuple headers.  This is
    7853                 :          * loosely based on the logic for pruning a HOT chain.
    7854                 :          */
    7855 CBC      261959 :         offnum = ItemPointerGetOffsetNumber(htid);
    7856 GIC      261959 :         priorXmax = InvalidTransactionId;   /* cannot check first XMIN */
    7857                 :         for (;;)
    7858           19298 :         {
    7859 ECB             :             ItemId      lp;
    7860                 :             HeapTupleHeader htup;
    7861                 : 
    7862                 :             /* Sanity check (pure paranoia) */
    7863 CBC      281257 :             if (offnum < FirstOffsetNumber)
    7864 LBC           0 :                 break;
    7865                 : 
    7866 ECB             :             /*
    7867                 :              * An offset past the end of the page's line pointer array is
    7868                 :              * possible when the array was truncated.
    7869                 :              */
    7870 GIC      281257 :             if (offnum > maxoff)
    7871 UIC           0 :                 break;
    7872                 : 
    7873 GIC      281257 :             lp = PageGetItemId(page, offnum);
    7874          281257 :             if (ItemIdIsRedirected(lp))
    7875                 :             {
    7876 CBC        8606 :                 offnum = ItemIdGetRedirect(lp);
    7877            8606 :                 continue;
    7878                 :             }
    7879                 : 
    7880                 :             /*
    7881                 :              * We'll often encounter LP_DEAD line pointers (especially with an
    7882                 :              * entry marked knowndeletable by our caller up front).  No heap
    7883                 :              * tuple headers get examined for an htid that leads us to an
    7884                 :              * LP_DEAD item.  This is okay because the earlier pruning
    7885                 :              * operation that made the line pointer LP_DEAD in the first place
    7886                 :              * must have considered the original tuple header as part of
    7887                 :              * generating its own snapshotConflictHorizon value.
    7888                 :              *
    7889                 :              * Relying on XLOG_HEAP2_PRUNE records like this is the same
    7890 ECB             :              * strategy that index vacuuming uses in all cases.  Index VACUUM
    7891                 :              * WAL records don't even have a snapshotConflictHorizon field of
    7892                 :              * their own for this reason.
    7893                 :              */
    7894 GIC      272651 :             if (!ItemIdIsNormal(lp))
    7895          176529 :                 break;
    7896                 : 
    7897           96122 :             htup = (HeapTupleHeader) PageGetItem(page, lp);
    7898                 : 
    7899                 :             /*
    7900                 :              * Check the tuple XMIN against prior XMAX, if any
    7901                 :              */
    7902 CBC      106814 :             if (TransactionIdIsValid(priorXmax) &&
    7903           10692 :                 !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
    7904 UIC           0 :                 break;
    7905                 : 
    7906 GNC       96122 :             HeapTupleHeaderAdvanceConflictHorizon(htup,
    7907                 :                                                   &snapshotConflictHorizon);
    7908                 : 
    7909                 :             /*
    7910                 :              * If the tuple is not HOT-updated, then we are at the end of this
    7911                 :              * HOT-chain.  No need to visit later tuples from the same update
    7912 ECB             :              * chain (they get their own index entries) -- just move on to
    7913                 :              * next htid from index AM caller.
    7914                 :              */
    7915 GIC       96122 :             if (!HeapTupleHeaderIsHotUpdated(htup))
    7916                 :                 break;
    7917                 : 
    7918                 :             /* Advance to next HOT chain member */
    7919           10692 :             Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
    7920           10692 :             offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
    7921           10692 :             priorXmax = HeapTupleHeaderGetUpdateXid(htup);
    7922                 :         }
    7923                 : 
    7924                 :         /* Enable further/final shrinking of deltids for caller */
    7925          261959 :         finalndeltids = i + 1;
    7926                 :     }
    7927                 : 
    7928            8280 :     UnlockReleaseBuffer(buf);
    7929                 : 
    7930                 :     /*
    7931                 :      * Shrink deltids array to exclude non-deletable entries at the end.  This
    7932                 :      * is not just a minor optimization.  Final deltids array size might be
    7933                 :      * zero for a bottom-up caller.  Index AM is explicitly allowed to rely on
    7934                 :      * ndeltids being zero in all cases with zero total deletable entries.
    7935                 :      */
    7936            8280 :     Assert(finalndeltids > 0 || delstate->bottomup);
    7937            8280 :     delstate->ndeltids = finalndeltids;
    7938                 : 
    7939 GNC        8280 :     return snapshotConflictHorizon;
    7940 ECB             : }
    7941                 : 
    7942                 : /*
    7943                 :  * Specialized inlineable comparison function for index_delete_sort()
    7944                 :  */
    7945                 : static inline int
    7946 GIC    18594991 : index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
    7947                 : {
    7948 CBC    18594991 :     ItemPointer tid1 = &deltid1->tid;
    7949        18594991 :     ItemPointer tid2 = &deltid2->tid;
    7950                 : 
    7951 ECB             :     {
    7952 CBC    18594991 :         BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
    7953        18594991 :         BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
    7954 ECB             : 
    7955 GIC    18594991 :         if (blk1 != blk2)
    7956         7685436 :             return (blk1 < blk2) ? -1 : 1;
    7957                 :     }
    7958                 :     {
    7959        10909555 :         OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1);
    7960        10909555 :         OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2);
    7961                 : 
    7962        10909555 :         if (pos1 != pos2)
    7963 CBC    10909555 :             return (pos1 < pos2) ? -1 : 1;
    7964                 :     }
    7965                 : 
    7966 LBC           0 :     Assert(false);
    7967                 : 
    7968 ECB             :     return 0;
    7969                 : }
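/*
 * Editor's note (illustrative, not part of heapam.c): with the comparator
 * above, TID (block=2, offset=5) sorts before (block=3, offset=1) on the
 * block comparison alone, and (block=2, offset=5) sorts before
 * (block=2, offset=9) via the offset tiebreak.  Two fully equal TIDs are
 * never expected among deltids, hence the Assert(false) fallthrough.
 */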
    7970                 : 
    7971                 : /*
    7972                 :  * Sort deltids array from delstate by TID.  This prepares it for further
    7973                 :  * processing by heap_index_delete_tuples().
    7974                 :  *
    7975                 :  * This operation becomes a noticeable consumer of CPU cycles with some
    7976                 :  * workloads, so we go to the trouble of specialization/micro optimization.
    7977                 :  * We use shellsort for this because it's easy to specialize, compiles to
    7978                 :  * relatively few instructions, and is adaptive to presorted inputs/subsets
    7979                 :  * (which are typical here).
    7980                 :  */
    7981                 : static void
    7982 GIC        8280 : index_delete_sort(TM_IndexDeleteOp *delstate)
    7983                 : {
    7984 CBC        8280 :     TM_IndexDelete *deltids = delstate->deltids;
    7985 GIC        8280 :     int         ndeltids = delstate->ndeltids;
    7986            8280 :     int         low = 0;
    7987                 : 
    7988 ECB             :     /*
    7989                 :      * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
    7990                 :      *
    7991                 :      * This implementation is fast with array sizes up to ~4500.  This covers
    7992                 :      * all supported BLCKSZ values.
    7993                 :      */
    7994 GIC        8280 :     const int   gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
    7995                 : 
    7996 ECB             :     /* Think carefully before changing anything here -- keep swaps cheap */
    7997                 :     StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
    7998                 :                      "element size exceeds 8 bytes");
    7999                 : 
    8000 CBC       82800 :     for (int g = 0; g < lengthof(gaps); g++)
    8001 ECB             :     {
    8002 GIC    10647541 :         for (int hi = gaps[g], i = low + hi; i < ndeltids; i++)
    8003                 :         {
    8004        10573021 :             TM_IndexDelete d = deltids[i];
    8005        10573021 :             int         j = i;
    8006                 : 
    8007        19191674 :             while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
    8008                 :             {
    8009         8618653 :                 deltids[j] = deltids[j - hi];
    8010 CBC     8618653 :                 j -= hi;
    8011 ECB             :             }
    8012 GIC    10573021 :             deltids[j] = d;
    8013 ECB             :         }
    8014                 :     }
    8015 GIC        8280 : }
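/*
 * Editor's note: a minimal standalone sketch (not PostgreSQL code) of the
 * shellsort pattern used by index_delete_sort() above, applied to a plain
 * int array.  The gap sequence and loop structure mirror the function; the
 * names and the demo main() are illustrative only.
 */
#include <stdio.h>

static void
demo_shellsort(int *a, int n)
{
    /* Same Sedgewick-Incerpi gap sequence as index_delete_sort() */
    const int   gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};

    for (int g = 0; g < 9; g++)
    {
        /* gapped insertion sort; adaptive to presorted input */
        for (int hi = gaps[g], i = hi; i < n; i++)
        {
            int         d = a[i];
            int         j = i;

            while (j >= hi && a[j - hi] >= d)
            {
                a[j] = a[j - hi];
                j -= hi;
            }
            a[j] = d;
        }
    }
}

int
main(void)
{
    int         a[] = {5, 1, 4, 1, 3, 9, 2, 6};

    demo_shellsort(a, 8);
    for (int i = 0; i < 8; i++)
        printf("%d ", a[i]);    /* prints: 1 1 2 3 4 5 6 9 */
    return 0;
}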
    8016                 : 
    8017                 : /*
    8018 ECB             :  * Returns how many blocks should be considered favorable/contiguous for a
    8019 EUB             :  * bottom-up index deletion pass.  This is a number of heap blocks that starts
    8020                 :  * from and includes the first block in line.
    8021                 :  *
    8022                 :  * There is always at least one favorable block during bottom-up index
    8023                 :  * deletion.  In the worst case (i.e. with totally random heap blocks) the
    8024                 :  * first block in line (the only favorable block) can be thought of as a
    8025 ECB             :  * degenerate array of contiguous blocks that consists of a single block.
    8026 EUB             :  * heap_index_delete_tuples() will expect this.
    8027                 :  *
    8028 ECB             :  * Caller passes blockgroups, a description of the final order that deltids
    8029                 :  * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
    8030                 :  * processing.  Note that deltids need not actually be sorted just yet (caller
    8031                 :  * only passes deltids to us so that we can interpret blockgroups).
    8032                 :  *
    8033                 :  * You might guess that the existence of contiguous blocks cannot matter much,
    8034                 :  * since in general the main factor that determines which blocks we visit is
    8035                 :  * the number of promising TIDs, which is a fixed hint from the index AM.
    8036                 :  * We're not really targeting the general case, though -- the actual goal is
    8037                 :  * to adapt our behavior to a wide variety of naturally occurring conditions.
    8038                 :  * The effects of most of the heuristics we apply are only noticeable in the
    8039                 :  * aggregate, over time and across many _related_ bottom-up index deletion
    8040                 :  * passes.
    8041                 :  *
    8042                 :  * Deeming certain blocks favorable allows heapam to recognize and adapt to
    8043                 :  * workloads where heap blocks visited during bottom-up index deletion can be
    8044                 :  * accessed contiguously, in the sense that each newly visited block is the
    8045                 :  * neighbor of the block that bottom-up deletion just finished processing (or
    8046                 :  * close enough to it).  It will likely be cheaper to access more favorable
    8047                 :  * blocks sooner rather than later (e.g. in this pass, not across a series of
    8048                 :  * related bottom-up passes).  Either way it is probably only a matter of time
    8049                 :  * (or a matter of further correlated version churn) before all blocks that
    8050                 :  * appear together as a single large batch of favorable blocks get accessed by
    8051                 :  * _some_ bottom-up pass.  Large batches of favorable blocks tend to either
    8052                 :  * appear almost constantly or not even once (it all depends on per-index
    8053                 :  * workload characteristics).
    8054                 :  *
    8055                 :  * Note that the blockgroups sort order applies a power-of-two bucketing
    8056                 :  * scheme that creates opportunities for contiguous groups of blocks to get
    8057                 :  * batched together, at least with workloads that are naturally amenable to
    8058                 :  * being driven by heap block locality.  This doesn't just enhance the spatial
    8059 EUB             :  * locality of bottom-up heap block processing in the obvious way.  It also
    8060                 :  * enables temporal locality of access, since sorting by heap block number
    8061 ECB             :  * naturally tends to make the bottom-up processing order deterministic.
    8062                 :  *
    8063                 :  * Consider the following example to get a sense of how temporal locality
    8064                 :  * might matter: There is a heap relation with several indexes, each of which
    8065                 :  * is low to medium cardinality.  It is subject to constant non-HOT updates.
    8066                 :  * The updates are skewed (in one part of the primary key, perhaps).  None of
    8067                 :  * the indexes are logically modified by the UPDATE statements (if they were
    8068                 :  * then bottom-up index deletion would not be triggered in the first place).
    8069                 :  * Naturally, each new round of index tuples (for each heap tuple that gets a
    8070                 :  * heap_update() call) will have the same heap TID in each and every index.
    8071                 :  * Since these indexes are low cardinality and never get logically modified,
    8072                 :  * heapam processing during bottom-up deletion passes will access heap blocks
    8073                 :  * in approximately sequential order.  Temporal locality of access occurs due
    8074                 :  * to bottom-up deletion passes behaving very similarly across each of the
    8075                 :  * indexes at any given moment.  This keeps the number of buffer misses needed
    8076                 :  * to visit heap blocks to a minimum.
    8077                 :  */
    8078                 : static int
    8079 GIC        3512 : bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups,
    8080 ECB             :                           TM_IndexDelete *deltids)
    8081                 : {
    8082 GIC        3512 :     int64       lastblock = -1;
    8083 CBC        3512 :     int         nblocksfavorable = 0;
    8084                 : 
    8085 GIC        3512 :     Assert(nblockgroups >= 1);
    8086            3512 :     Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS);
    8087                 : 
    8088                 :     /*
    8089                 :      * We tolerate heap blocks that will be accessed only slightly out of
    8090                 :      * physical order.  Small blips occur when a pair of almost-contiguous
    8091 ECB             :      * blocks happen to fall into different buckets (perhaps due only to a
    8092                 :      * small difference in npromisingtids that the bucketing scheme didn't
    8093                 :      * quite manage to ignore).  We effectively ignore these blips by applying
    8094                 :      * a small tolerance.  The precise tolerance we use is a little arbitrary,
    8095                 :      * but it works well enough in practice.
    8096                 :      */
    8097 GIC       12318 :     for (int b = 0; b < nblockgroups; b++)
    8098                 :     {
    8099           11563 :         IndexDeleteCounts *group = blockgroups + b;
    8100           11563 :         TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
    8101 CBC       11563 :         BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid);
    8102                 : 
    8103           11563 :         if (lastblock != -1 &&
    8104            8051 :             ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS ||
    8105 GIC        7393 :              (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS))
    8106                 :             break;
    8107 ECB             : 
    8108 CBC        8806 :         nblocksfavorable++;
    8109 GIC        8806 :         lastblock = block;
    8110 ECB             :     }
    8111                 : 
    8112                 :     /* Always indicate that there is at least 1 favorable block */
    8113 GIC        3512 :     Assert(nblocksfavorable >= 1);
    8114 ECB             : 
    8115 CBC        3512 :     return nblocksfavorable;
    8116                 : }
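/*
 * Editor's note: a worked example of the tolerance test above, assuming (for
 * illustration only) a tolerance of 1 block.  If the sorted block groups
 * start at heap blocks 10, 11, 13, and 40, then blocks 10 and 11 are counted
 * as favorable, block 13 fails the test (13 > 11 + 1) and breaks the loop,
 * and block 40 is never examined -- nblocksfavorable is 2.
 */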
    8117 ECB             : 
    8118                 : /*
    8119                 :  * qsort comparison function for bottomup_sort_and_shrink()
    8120                 :  */
    8121 EUB             : static int
    8122 GIC      247741 : bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
    8123                 : {
    8124          247741 :     const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1;
    8125          247741 :     const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2;
    8126                 : 
    8127                 :     /*
     8128                 :      * The most significant field is npromisingtids (whose comparison we
     8129                 :      * invert so as to sort in descending order).
    8130                 :      *
    8131                 :      * Caller should have already normalized npromisingtids fields into
    8132                 :      * power-of-two values (buckets).
    8133                 :      */
    8134          247741 :     if (group1->npromisingtids > group2->npromisingtids)
    8135           15597 :         return -1;
    8136          232144 :     if (group1->npromisingtids < group2->npromisingtids)
    8137 CBC       21578 :         return 1;
    8138                 : 
    8139 ECB             :     /*
    8140                 :      * Tiebreak: desc ntids sort order.
    8141                 :      *
    8142                 :      * We cannot expect power-of-two values for ntids fields.  We should
    8143                 :      * behave as if they were already rounded up for us instead.
    8144                 :      */
    8145 GIC      210566 :     if (group1->ntids != group2->ntids)
    8146                 :     {
    8147          137005 :         uint32      ntids1 = pg_nextpower2_32((uint32) group1->ntids);
    8148          137005 :         uint32      ntids2 = pg_nextpower2_32((uint32) group2->ntids);
    8149 ECB             : 
    8150 GIC      137005 :         if (ntids1 > ntids2)
    8151           22329 :             return -1;
    8152          114676 :         if (ntids1 < ntids2)
    8153           28790 :             return 1;
    8154                 :     }
    8155 ECB             : 
    8156                 :     /*
    8157                 :      * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
    8158                 :      * block in deltids array) order.
    8159                 :      *
    8160                 :      * This is equivalent to sorting in ascending heap block number order
    8161                 :      * (among otherwise equal subsets of the array).  This approach allows us
    8162                 :      * to avoid accessing the out-of-line TID.  (We rely on the assumption
    8163                 :      * that the deltids array was sorted in ascending heap TID order when
    8164                 :      * these offsets to the first TID from each heap block group were formed.)
    8165                 :      */
    8166 GIC      159447 :     if (group1->ifirsttid > group2->ifirsttid)
    8167 CBC       78579 :         return 1;
    8168 GIC       80868 :     if (group1->ifirsttid < group2->ifirsttid)
    8169           80868 :         return -1;
    8170 ECB             : 
    8171 UIC           0 :     pg_unreachable();
    8172                 : 
    8173                 :     return 0;
    8174                 : }
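/*
 * Editor's note: an illustrative comparison.  Given group1 with
 * {npromisingtids = 8, ntids = 17, ifirsttid = 100} and group2 with
 * {npromisingtids = 8, ntids = 25, ifirsttid = 40}: npromisingtids ties,
 * and the ntids tiebreak also ties once bucketed (pg_nextpower2_32(17) ==
 * pg_nextpower2_32(25) == 32), so the ascending ifirsttid tiebreak puts
 * group2 first (40 < 100).
 */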
    8175                 : 
    8176                 : /*
    8177                 :  * heap_index_delete_tuples() helper function for bottom-up deletion callers.
    8178                 :  *
    8179                 :  * Sorts deltids array in the order needed for useful processing by bottom-up
    8180                 :  * deletion.  The array should already be sorted in TID order when we're
    8181                 :  * called.  The sort process groups heap TIDs from deltids into heap block
    8182                 :  * groupings.  Earlier/more-promising groups/blocks are usually those that are
    8183                 :  * known to have the most "promising" TIDs.
    8184                 :  *
    8185                 :  * Sets new size of deltids array (ndeltids) in state.  deltids will only have
    8186                 :  * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
    8187                 :  * return.  This often means that deltids will be shrunk to a small fraction
    8188                 :  * of its original size (we eliminate many heap blocks from consideration for
    8189                 :  * caller up front).
    8190                 :  *
    8191                 :  * Returns the number of "favorable" blocks.  See bottomup_nblocksfavorable()
    8192                 :  * for a definition and full details.
    8193                 :  */
    8194                 : static int
    8195 GIC        3512 : bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
    8196                 : {
    8197                 :     IndexDeleteCounts *blockgroups;
    8198                 :     TM_IndexDelete *reordereddeltids;
    8199            3512 :     BlockNumber curblock = InvalidBlockNumber;
    8200            3512 :     int         nblockgroups = 0;
    8201            3512 :     int         ncopied = 0;
    8202            3512 :     int         nblocksfavorable = 0;
    8203                 : 
    8204            3512 :     Assert(delstate->bottomup);
    8205            3512 :     Assert(delstate->ndeltids > 0);
    8206                 : 
    8207                 :     /* Calculate per-heap-block count of TIDs */
    8208            3512 :     blockgroups = palloc(sizeof(IndexDeleteCounts) * delstate->ndeltids);
    8209         1484596 :     for (int i = 0; i < delstate->ndeltids; i++)
    8210                 :     {
    8211         1481084 :         TM_IndexDelete *ideltid = &delstate->deltids[i];
    8212         1481084 :         TM_IndexStatus *istatus = delstate->status + ideltid->id;
    8213         1481084 :         ItemPointer htid = &ideltid->tid;
    8214         1481084 :         bool        promising = istatus->promising;
    8215                 : 
    8216         1481084 :         if (curblock != ItemPointerGetBlockNumber(htid))
    8217                 :         {
    8218                 :             /* New block group */
    8219           57032 :             nblockgroups++;
    8220                 : 
    8221           57032 :             Assert(curblock < ItemPointerGetBlockNumber(htid) ||
    8222                 :                    !BlockNumberIsValid(curblock));
    8223                 : 
    8224           57032 :             curblock = ItemPointerGetBlockNumber(htid);
    8225           57032 :             blockgroups[nblockgroups - 1].ifirsttid = i;
    8226           57032 :             blockgroups[nblockgroups - 1].ntids = 1;
    8227           57032 :             blockgroups[nblockgroups - 1].npromisingtids = 0;
    8228                 :         }
    8229                 :         else
    8230                 :         {
    8231         1424052 :             blockgroups[nblockgroups - 1].ntids++;
    8232                 :         }
    8233                 : 
    8234 CBC     1481084 :         if (promising)
    8235 GIC      246652 :             blockgroups[nblockgroups - 1].npromisingtids++;
    8236                 :     }
    8237 ECB             : 
    8238                 :     /*
    8239                 :      * We're about ready to sort block groups to determine the optimal order
    8240                 :      * for visiting heap blocks.  But before we do, round the number of
    8241                 :      * promising tuples for each block group up to the next power-of-two,
    8242                 :      * unless it is very low (less than 4), in which case we round up to 4.
    8243                 :      * npromisingtids is far too noisy to trust when choosing between a pair
    8244                 :      * of block groups that both have very low values.
    8245                 :      *
    8246                 :      * This scheme divides heap blocks/block groups into buckets.  Each bucket
    8247                 :      * contains blocks that have _approximately_ the same number of promising
    8248                 :      * TIDs as each other.  The goal is to ignore relatively small differences
    8249                 :      * in the total number of promising entries, so that the whole process can
    8250                 :      * give a little weight to heapam factors (like heap block locality)
    8251                 :      * instead.  This isn't a trade-off, really -- we have nothing to lose. It
    8252                 :      * would be foolish to interpret small differences in npromisingtids
    8253                 :      * values as anything more than noise.
    8254                 :      *
    8255                 :      * We tiebreak on nhtids when sorting block group subsets that have the
    8256                 :      * same npromisingtids, but this has the same issues as npromisingtids,
    8257                 :      * and so nhtids is subject to the same power-of-two bucketing scheme. The
    8258                 :      * only reason that we don't fix nhtids in the same way here too is that
    8259                 :      * we'll need accurate nhtids values after the sort.  We handle nhtids
    8260                 :      * bucketization dynamically instead (in the sort comparator).
    8261                 :      *
    8262                 :      * See bottomup_nblocksfavorable() for a full explanation of when and how
    8263                 :      * heap locality/favorable blocks can significantly influence when and how
    8264                 :      * heap blocks are accessed.
    8265                 :      */
    8266 GIC       60544 :     for (int b = 0; b < nblockgroups; b++)
    8267                 :     {
    8268 CBC       57032 :         IndexDeleteCounts *group = blockgroups + b;
    8269                 : 
    8270 ECB             :         /* Better off falling back on nhtids with low npromisingtids */
    8271 GIC       57032 :         if (group->npromisingtids <= 4)
    8272           45762 :             group->npromisingtids = 4;
    8273                 :         else
    8274           11270 :             group->npromisingtids =
    8275           11270 :                 pg_nextpower2_32((uint32) group->npromisingtids);
    8276                 :     }
    8277 ECB             : 
    8278                 :     /* Sort groups and rearrange caller's deltids array */
    8279 CBC        3512 :     qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts),
    8280 ECB             :           bottomup_sort_and_shrink_cmp);
    8281 GIC        3512 :     reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
    8282                 : 
    8283            3512 :     nblockgroups = Min(BOTTOMUP_MAX_NBLOCKS, nblockgroups);
    8284                 :     /* Determine number of favorable blocks at the start of final deltids */
    8285            3512 :     nblocksfavorable = bottomup_nblocksfavorable(blockgroups, nblockgroups,
    8286                 :                                                  delstate->deltids);
    8287                 : 
    8288           24010 :     for (int b = 0; b < nblockgroups; b++)
    8289 ECB             :     {
    8290 CBC       20498 :         IndexDeleteCounts *group = blockgroups + b;
    8291           20498 :         TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
    8292 ECB             : 
    8293 GIC       20498 :         memcpy(reordereddeltids + ncopied, firstdtid,
    8294           20498 :                sizeof(TM_IndexDelete) * group->ntids);
    8295           20498 :         ncopied += group->ntids;
    8296                 :     }
    8297                 : 
    8298                 :     /* Copy final grouped and sorted TIDs back into start of caller's array */
    8299            3512 :     memcpy(delstate->deltids, reordereddeltids,
    8300 ECB             :            sizeof(TM_IndexDelete) * ncopied);
    8301 GIC        3512 :     delstate->ndeltids = ncopied;
    8302 ECB             : 
    8303 CBC        3512 :     pfree(reordereddeltids);
    8304 GIC        3512 :     pfree(blockgroups);
    8305 ECB             : 
    8306 CBC        3512 :     return nblocksfavorable;
    8307 ECB             : }
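/*
 * Editor's note: a minimal standalone sketch (not PostgreSQL code) of the
 * power-of-two bucketing applied to npromisingtids above.  next_pow2() is a
 * portable stand-in for PostgreSQL's pg_nextpower2_32(); the bucketing rule
 * matches the normalization loop in bottomup_sort_and_shrink().
 */
#include <stdint.h>

static uint32_t
next_pow2(uint32_t v)
{
    /* round v up to the next power of two (v must be > 0) */
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    return v + 1;
}

static uint32_t
promising_bucket(uint32_t npromisingtids)
{
    /* very low counts are too noisy to compare meaningfully: clamp to 4 */
    if (npromisingtids <= 4)
        return 4;
    return next_pow2(npromisingtids);   /* e.g. 5..8 -> 8, 9..16 -> 16 */
}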
    8308                 : 
    8309                 : /*
    8310                 :  * Perform XLogInsert for a heap-visible operation.  'block' is the block
    8311                 :  * being marked all-visible, and vm_buffer is the buffer containing the
    8312                 :  * corresponding visibility map block.  Both should have already been modified
    8313                 :  * and dirtied.
    8314                 :  *
    8315                 :  * snapshotConflictHorizon comes from the largest xmin on the page being
    8316                 :  * marked all-visible.  REDO routine uses it to generate recovery conflicts.
    8317                 :  *
    8318                 :  * If checksums or wal_log_hints are enabled, we may also generate a full-page
    8319                 :  * image of heap_buffer. Otherwise, we optimize away the FPI (by specifying
    8320                 :  * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not*
    8321                 :  * update the heap page's LSN.
    8322                 :  */
    8323                 : XLogRecPtr
    8324 GNC      154036 : log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer,
    8325                 :                  TransactionId snapshotConflictHorizon, uint8 vmflags)
    8326 ECB             : {
    8327                 :     xl_heap_visible xlrec;
    8328                 :     XLogRecPtr  recptr;
    8329                 :     uint8       flags;
    8330                 : 
    8331 GIC      154036 :     Assert(BufferIsValid(heap_buffer));
    8332 CBC      154036 :     Assert(BufferIsValid(vm_buffer));
    8333 ECB             : 
    8334 GNC      154036 :     xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
    8335 CBC      154036 :     xlrec.flags = vmflags;
    8336 GNC      154036 :     if (RelationIsAccessibleInLogicalDecoding(rel))
    8337              25 :         xlrec.flags |= VISIBILITYMAP_XLOG_CATALOG_REL;
    8338 CBC      154036 :     XLogBeginInsert();
    8339          154036 :     XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
    8340 ECB             : 
    8341 GIC      154036 :     XLogRegisterBuffer(0, vm_buffer, 0);
    8342 ECB             : 
    8343 GIC      154036 :     flags = REGBUF_STANDARD;
    8344          154036 :     if (!XLogHintBitIsNeeded())
    8345 CBC      145698 :         flags |= REGBUF_NO_IMAGE;
    8346 GIC      154036 :     XLogRegisterBuffer(1, heap_buffer, flags);
    8347 ECB             : 
    8348 GIC      154036 :     recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_VISIBLE);
    8349                 : 
    8350 CBC      154036 :     return recptr;
    8351 ECB             : }
    8352                 : 
    8353                 : /*
    8354                 :  * Perform XLogInsert for a heap-update operation.  Caller must already
    8355                 :  * have modified the buffer(s) and marked them dirty.
    8356                 :  */
    8357                 : static XLogRecPtr
    8358 GIC      407557 : log_heap_update(Relation reln, Buffer oldbuf,
    8359                 :                 Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
    8360 ECB             :                 HeapTuple old_key_tuple,
    8361                 :                 bool all_visible_cleared, bool new_all_visible_cleared)
    8362                 : {
    8363                 :     xl_heap_update xlrec;
    8364                 :     xl_heap_header xlhdr;
    8365                 :     xl_heap_header xlhdr_idx;
    8366                 :     uint8       info;
    8367                 :     uint16      prefix_suffix[2];
    8368 GIC      407557 :     uint16      prefixlen = 0,
    8369          407557 :                 suffixlen = 0;
    8370                 :     XLogRecPtr  recptr;
    8371          407557 :     Page        page = BufferGetPage(newbuf);
    8372          407557 :     bool        need_tuple_data = RelationIsLogicallyLogged(reln);
    8373                 :     bool        init;
    8374                 :     int         bufflags;
    8375                 : 
    8376                 :     /* Caller should not call me on a non-WAL-logged relation */
    8377          407557 :     Assert(RelationNeedsWAL(reln));
    8378                 : 
    8379          407557 :     XLogBeginInsert();
    8380                 : 
    8381          407557 :     if (HeapTupleIsHeapOnly(newtup))
    8382          212632 :         info = XLOG_HEAP_HOT_UPDATE;
    8383                 :     else
    8384          194925 :         info = XLOG_HEAP_UPDATE;
    8385                 : 
    8386                 :     /*
    8387                 :      * If the old and new tuple are on the same page, we only need to log the
    8388                 :      * parts of the new tuple that were changed.  That saves on the amount of
    8389                 :      * WAL we need to write.  Currently, we just count any unchanged bytes in
    8390                 :      * the beginning and end of the tuple.  That's quick to check, and
    8391                 :      * perfectly covers the common case that only one field is updated.
    8392 ECB             :      *
    8393                 :      * We could do this even if the old and new tuple are on different pages,
    8394                 :      * but only if we don't make a full-page image of the old page, which is
    8395                 :      * difficult to know in advance.  Also, if the old tuple is corrupt for
     8396                 :      * some reason, it would allow the corruption to propagate to the new page,
     8397                 :      * so it seems best to avoid that.  Under the general assumption that most
    8398                 :      * updates tend to create the new tuple version on the same page, there
    8399                 :      * isn't much to be gained by doing this across pages anyway.
    8400                 :      *
    8401                 :      * Skip this if we're taking a full-page image of the new page, as we
    8402                 :      * don't include the new tuple in the WAL record in that case.  Also
    8403                 :      * disable if wal_level='logical', as logical decoding needs to be able to
    8404                 :      * read the new tuple in whole from the WAL record alone.
    8405                 :      */
    8406 GIC      407557 :     if (oldbuf == newbuf && !need_tuple_data &&
    8407 CBC      187993 :         !XLogCheckBufferNeedsBackup(newbuf))
    8408                 :     {
    8409          187185 :         char       *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
    8410 GIC      187185 :         char       *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
    8411 CBC      187185 :         int         oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
    8412 GIC      187185 :         int         newlen = newtup->t_len - newtup->t_data->t_hoff;
    8413                 : 
    8414 ECB             :         /* Check for common prefix between old and new tuple */
    8415 GIC    18651419 :         for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
    8416 ECB             :         {
    8417 CBC    18603122 :             if (newp[prefixlen] != oldp[prefixlen])
    8418 GIC      138888 :                 break;
    8419 ECB             :         }
    8420                 : 
    8421                 :         /*
    8422                 :          * Storing the length of the prefix takes 2 bytes, so we need to save
    8423                 :          * at least 3 bytes or there's no point.
    8424                 :          */
    8425 CBC      187185 :         if (prefixlen < 3)
    8426 GIC       21858 :             prefixlen = 0;
    8427 ECB             : 
    8428                 :         /* Same for suffix */
    8429 CBC     6371228 :         for (suffixlen = 0; suffixlen < Min(oldlen, newlen) - prefixlen; suffixlen++)
    8430 ECB             :         {
    8431 GIC     6322739 :             if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
    8432 CBC      138696 :                 break;
    8433                 :         }
    8434 GIC      187185 :         if (suffixlen < 3)
    8435           56199 :             suffixlen = 0;
    8436                 :     }
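        /*
         * Editor's note: a worked example of the prefix/suffix computation
         * above (tuple bytes are illustrative).  If the old tuple data is
         * "aaaaXXbbbb" and the new tuple data is "aaaaYYbbbb", the loops
         * find prefixlen = 4 and suffixlen = 4 (both >= 3, so both are
         * kept), and only the two changed bytes "YY" plus two 2-byte length
         * fields are logged instead of all ten data bytes.
         */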
    8437                 : 
    8438                 :     /* Prepare main WAL data chain */
    8439          407557 :     xlrec.flags = 0;
    8440          407557 :     if (all_visible_cleared)
    8441             960 :         xlrec.flags |= XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED;
    8442          407557 :     if (new_all_visible_cleared)
    8443             362 :         xlrec.flags |= XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED;
    8444          407557 :     if (prefixlen > 0)
    8445          165327 :         xlrec.flags |= XLH_UPDATE_PREFIX_FROM_OLD;
    8446          407557 :     if (suffixlen > 0)
    8447          130986 :         xlrec.flags |= XLH_UPDATE_SUFFIX_FROM_OLD;
    8448          407557 :     if (need_tuple_data)
    8449                 :     {
    8450 CBC       84736 :         xlrec.flags |= XLH_UPDATE_CONTAINS_NEW_TUPLE;
    8451 GIC       84736 :         if (old_key_tuple)
    8452                 :         {
    8453             214 :             if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
    8454              76 :                 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_TUPLE;
    8455                 :             else
    8456             138 :                 xlrec.flags |= XLH_UPDATE_CONTAINS_OLD_KEY;
    8457 ECB             :         }
    8458                 :     }
    8459                 : 
    8460                 :     /* If new tuple is the single and first tuple on page... */
    8461 CBC      414488 :     if (ItemPointerGetOffsetNumber(&(newtup->t_self)) == FirstOffsetNumber &&
    8462            6931 :         PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
    8463 ECB             :     {
    8464 CBC        6864 :         info |= XLOG_HEAP_INIT_PAGE;
    8465            6864 :         init = true;
    8466                 :     }
    8467 ECB             :     else
    8468 GIC      400693 :         init = false;
    8469 ECB             : 
    8470                 :     /* Prepare WAL data for the old page */
    8471 CBC      407557 :     xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
    8472          407557 :     xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
    8473 GIC      815114 :     xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
    8474 CBC      407557 :                                               oldtup->t_data->t_infomask2);
    8475                 : 
    8476 ECB             :     /* Prepare WAL data for the new page */
    8477 GIC      407557 :     xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
    8478          407557 :     xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
    8479                 : 
    8480          407557 :     bufflags = REGBUF_STANDARD;
    8481          407557 :     if (init)
    8482            6864 :         bufflags |= REGBUF_WILL_INIT;
    8483          407557 :     if (need_tuple_data)
    8484 CBC       84736 :         bufflags |= REGBUF_KEEP_DATA;
    8485                 : 
    8486 GIC      407557 :     XLogRegisterBuffer(0, newbuf, bufflags);
    8487          407557 :     if (oldbuf != newbuf)
    8488          186055 :         XLogRegisterBuffer(1, oldbuf, REGBUF_STANDARD);
    8489                 : 
    8490          407557 :     XLogRegisterData((char *) &xlrec, SizeOfHeapUpdate);
    8491                 : 
    8492                 :     /*
    8493                 :      * Prepare WAL data for the new tuple.
    8494 ECB             :      */
    8495 CBC      407557 :     if (prefixlen > 0 || suffixlen > 0)
    8496                 :     {
    8497          186819 :         if (prefixlen > 0 && suffixlen > 0)
    8498 ECB             :         {
    8499 GIC      109494 :             prefix_suffix[0] = prefixlen;
    8500          109494 :             prefix_suffix[1] = suffixlen;
    8501          109494 :             XLogRegisterBufData(0, (char *) &prefix_suffix, sizeof(uint16) * 2);
    8502                 :         }
    8503 CBC       77325 :         else if (prefixlen > 0)
    8504                 :         {
    8505           55833 :             XLogRegisterBufData(0, (char *) &prefixlen, sizeof(uint16));
    8506                 :         }
    8507 ECB             :         else
    8508                 :         {
    8509 GIC       21492 :             XLogRegisterBufData(0, (char *) &suffixlen, sizeof(uint16));
    8510 ECB             :         }
    8511                 :     }
    8512                 : 
    8513 GIC      407557 :     xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
    8514          407557 :     xlhdr.t_infomask = newtup->t_data->t_infomask;
    8515          407557 :     xlhdr.t_hoff = newtup->t_data->t_hoff;
    8516          407557 :     Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
    8517                 : 
    8518                 :     /*
    8519                 :      * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
    8520                 :      *
    8521                 :      * The 'data' doesn't include the common prefix or suffix.
    8522                 :      */
    8523          407557 :     XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
    8524          407557 :     if (prefixlen == 0)
    8525                 :     {
    8526          242230 :         XLogRegisterBufData(0,
    8527          242230 :                             ((char *) newtup->t_data) + SizeofHeapTupleHeader,
    8528          242230 :                             newtup->t_len - SizeofHeapTupleHeader - suffixlen);
    8529                 :     }
    8530                 :     else
    8531                 :     {
    8532 ECB             :         /*
    8533                 :          * Have to write the null bitmap and data after the common prefix as
    8534                 :          * two separate rdata entries.
    8535                 :          */
    8536                 :         /* bitmap [+ padding] [+ oid] */
    8537 CBC      165327 :         if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
    8538 ECB             :         {
    8539 GIC      165327 :             XLogRegisterBufData(0,
    8540          165327 :                                 ((char *) newtup->t_data) + SizeofHeapTupleHeader,
    8541 CBC      165327 :                                 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
    8542                 :         }
    8543 ECB             : 
    8544                 :         /* data after common prefix */
    8545 GIC      165327 :         XLogRegisterBufData(0,
    8546          165327 :                             ((char *) newtup->t_data) + newtup->t_data->t_hoff + prefixlen,
    8547          165327 :                             newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
    8548                 :     }
    8549                 : 
    8550                 :     /* We need to log a tuple identity */
    8551 CBC      407557 :     if (need_tuple_data && old_key_tuple)
    8552 ECB             :     {
     8553                 :         /* don't really need this, but it's more convenient to decode */
    8554 GIC         214 :         xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
    8555 CBC         214 :         xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
    8556 GIC         214 :         xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
    8557 ECB             : 
    8558 CBC         214 :         XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
    8559                 : 
    8560 ECB             :         /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
    8561 CBC         214 :         XLogRegisterData((char *) old_key_tuple->t_data + SizeofHeapTupleHeader,
    8562 GIC         214 :                          old_key_tuple->t_len - SizeofHeapTupleHeader);
    8563                 :     }
    8564                 : 
    8565 ECB             :     /* filtering by origin on a row level is much more efficient */
    8566 CBC      407557 :     XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
    8567 ECB             : 
    8568 CBC      407557 :     recptr = XLogInsert(RM_HEAP_ID, info);
    8569 ECB             : 
    8570 CBC      407557 :     return recptr;
    8571 ECB             : }
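/*
 * Editor's note: a minimal standalone sketch (not PostgreSQL code) of the
 * prefix/suffix delta computation used by log_heap_update() above.  All
 * names are illustrative; the "must save at least 3 bytes" rule matches the
 * function.
 */
static void
delta_bounds(const char *oldp, int oldlen,
             const char *newp, int newlen,
             int *prefixlen, int *suffixlen)
{
    int         minlen = (oldlen < newlen) ? oldlen : newlen;
    int         p;
    int         s;

    /* count unchanged bytes at the front */
    for (p = 0; p < minlen; p++)
        if (newp[p] != oldp[p])
            break;
    /* storing the 2-byte prefix length must save at least 3 bytes */
    if (p < 3)
        p = 0;

    /* count unchanged bytes at the back, not overlapping the prefix */
    for (s = 0; s < minlen - p; s++)
        if (newp[newlen - s - 1] != oldp[oldlen - s - 1])
            break;
    if (s < 3)
        s = 0;

    *prefixlen = p;
    *suffixlen = s;
}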
    8572                 : 
    8573                 : /*
    8574                 :  * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
    8575                 :  *
    8576                 :  * This is only used in wal_level >= WAL_LEVEL_LOGICAL, and only for catalog
    8577                 :  * tuples.
    8578                 :  */
    8579                 : static XLogRecPtr
    8580 CBC       26222 : log_heap_new_cid(Relation relation, HeapTuple tup)
    8581                 : {
    8582 ECB             :     xl_heap_new_cid xlrec;
    8583                 : 
    8584                 :     XLogRecPtr  recptr;
    8585 GIC       26222 :     HeapTupleHeader hdr = tup->t_data;
    8586                 : 
    8587 CBC       26222 :     Assert(ItemPointerIsValid(&tup->t_self));
    8588           26222 :     Assert(tup->t_tableOid != InvalidOid);
    8589                 : 
    8590           26222 :     xlrec.top_xid = GetTopTransactionId();
    8591 GNC       26222 :     xlrec.target_locator = relation->rd_locator;
    8592 GIC       26222 :     xlrec.target_tid = tup->t_self;
    8593                 : 
    8594 ECB             :     /*
    8595                 :      * If the tuple got inserted & deleted in the same TX we definitely have a
    8596                 :      * combo CID, set cmin and cmax.
    8597                 :      */
    8598 CBC       26222 :     if (hdr->t_infomask & HEAP_COMBOCID)
    8599 ECB             :     {
    8600 CBC        2091 :         Assert(!(hdr->t_infomask & HEAP_XMAX_INVALID));
    8601 GIC        2091 :         Assert(!HeapTupleHeaderXminInvalid(hdr));
    8602            2091 :         xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
    8603 CBC        2091 :         xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
    8604            2091 :         xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
    8605                 :     }
    8606 ECB             :     /* No combo CID, so only cmin or cmax can be set by this TX */
    8607                 :     else
    8608                 :     {
    8609                 :         /*
    8610                 :          * Tuple inserted.
    8611                 :          *
    8612                 :          * We need to check for LOCK ONLY because multixacts might be
    8613                 :          * transferred to the new tuple in case of FOR KEY SHARE updates in
    8614                 :          * which case there will be an xmax, although the tuple just got
    8615                 :          * inserted.
    8616                 :          */
    8617 GIC       24131 :         if (hdr->t_infomask & HEAP_XMAX_INVALID ||
    8618            5788 :             HEAP_XMAX_IS_LOCKED_ONLY(hdr->t_infomask))
    8619                 :         {
    8620           18344 :             xlrec.cmin = HeapTupleHeaderGetRawCommandId(hdr);
    8621 CBC       18344 :             xlrec.cmax = InvalidCommandId;
    8622                 :         }
    8623 ECB             :         /* Tuple from a different tx updated or deleted. */
    8624                 :         else
    8625                 :         {
    8626 CBC        5787 :             xlrec.cmin = InvalidCommandId;
    8627            5787 :             xlrec.cmax = HeapTupleHeaderGetRawCommandId(hdr);
    8628                 :         }
    8629           24131 :         xlrec.combocid = InvalidCommandId;
    8630                 :     }
    8631 ECB             : 
    8632                 :     /*
    8633                 :      * Note that we don't need to register the buffer here, because this
    8634                 :      * operation does not modify the page. The insert/update/delete that
    8635                 :      * called us certainly did, but that's WAL-logged separately.
    8636                 :      */
    8637 GIC       26222 :     XLogBeginInsert();
    8638           26222 :     XLogRegisterData((char *) &xlrec, SizeOfHeapNewCid);
    8639 ECB             : 
    8640                 :     /* will be looked at irrespective of origin */
    8641                 : 
    8642 CBC       26222 :     recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_NEW_CID);
    8643                 : 
    8644 GIC       26222 :     return recptr;
    8645                 : }
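/*
 * Editor's note: an illustrative summary of the cmin/cmax cases above.
 * Tuple inserted and deleted in the same transaction (HEAP_COMBOCID set):
 * log cmin, cmax, and the raw combo command id.  Tuple only inserted here
 * (xmax invalid or a locker only): log cmin, with cmax = InvalidCommandId.
 * Tuple from an earlier transaction updated/deleted here: log cmax, with
 * cmin = InvalidCommandId.  combocid is InvalidCommandId in both non-combo
 * cases.
 */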
    8646                 : 
    8647                 : /*
    8648                 :  * Build a heap tuple representing the configured REPLICA IDENTITY to represent
    8649 ECB             :  * the old tuple in an UPDATE or DELETE.
    8650                 :  *
    8651                 :  * Returns NULL if there's no need to log an identity or if there's no suitable
    8652                 :  * key defined.
    8653                 :  *
    8654                 :  * Pass key_required true if any replica identity columns changed value, or if
    8655                 :  * any of them have any external data.  Delete must always pass true.
    8656                 :  *
    8657                 :  * *copy is set to true if the returned tuple is a modified copy rather than
    8658                 :  * the same tuple that was passed in.
    8659                 :  */
    8660                 : static HeapTuple
    8661 GIC     1838966 : ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
    8662                 :                        bool *copy)
    8663 ECB             : {
    8664 GIC     1838966 :     TupleDesc   desc = RelationGetDescr(relation);
    8665 CBC     1838966 :     char        replident = relation->rd_rel->relreplident;
    8666 ECB             :     Bitmapset  *idattrs;
    8667                 :     HeapTuple   key_tuple;
    8668                 :     bool        nulls[MaxHeapAttributeNumber];
    8669                 :     Datum       values[MaxHeapAttributeNumber];
    8670                 : 
    8671 CBC     1838966 :     *copy = false;
    8672 ECB             : 
    8673 CBC     1838966 :     if (!RelationIsLogicallyLogged(relation))
    8674 GIC     1665648 :         return NULL;
    8675                 : 
    8676          173318 :     if (replident == REPLICA_IDENTITY_NOTHING)
    8677 CBC         234 :         return NULL;
    8678                 : 
    8679 GIC      173084 :     if (replident == REPLICA_IDENTITY_FULL)
    8680 ECB             :     {
    8681                 :         /*
    8682                 :          * When logging the entire old tuple, it very well could contain
    8683                 :          * toasted columns. If so, force them to be inlined.
    8684                 :          */
    8685 GIC         297 :         if (HeapTupleHasExternal(tp))
    8686                 :         {
    8687 CBC           4 :             *copy = true;
    8688               4 :             tp = toast_flatten_tuple(tp, desc);
    8689                 :         }
    8690 GIC         297 :         return tp;
    8691                 :     }
    8692 ECB             : 
    8693                 :     /* if the key isn't required and we're only logging the key, we're done */
    8694 CBC      172787 :     if (!key_required)
    8695 GIC       84522 :         return NULL;
    8696 ECB             : 
    8697                 :     /* find out the replica identity columns */
    8698 GIC       88265 :     idattrs = RelationGetIndexAttrBitmap(relation,
    8699                 :                                          INDEX_ATTR_BITMAP_IDENTITY_KEY);
    8700                 : 
    8701                 :     /*
    8702                 :      * If there's no defined replica identity columns, treat as !key_required.
    8703                 :      * (This case should not be reachable from heap_update, since that should
    8704                 :      * calculate key_required accurately.  But heap_delete just passes
    8705                 :      * constant true for key_required, so we can hit this case in deletes.)
    8706 ECB             :      */
    8707 GIC       88265 :     if (bms_is_empty(idattrs))
    8708            6028 :         return NULL;
    8709                 : 
    8710                 :     /*
    8711 ECB             :      * Construct a new tuple containing only the replica identity columns,
    8712                 :      * with nulls elsewhere.  While we're at it, assert that the replica
    8713                 :      * identity columns aren't null.
    8714                 :      */
    8715 GIC       82237 :     heap_deform_tuple(tp, desc, values, nulls);
    8716 ECB             : 
    8717 CBC      326811 :     for (int i = 0; i < desc->natts; i++)
    8718 ECB             :     {
    8719 GIC      244574 :         if (bms_is_member(i + 1 - FirstLowInvalidHeapAttributeNumber,
    8720                 :                           idattrs))
    8721           82246 :             Assert(!nulls[i]);
    8722                 :         else
    8723          162328 :             nulls[i] = true;
    8724 ECB             :     }
    8725                 : 
    8726 CBC       82237 :     key_tuple = heap_form_tuple(desc, values, nulls);
    8727           82237 :     *copy = true;
    8728 ECB             : 
    8729 CBC       82237 :     bms_free(idattrs);
    8730 ECB             : 
    8731                 :     /*
    8732                 :      * If the tuple, which by here only contains indexed columns, still has
    8733                 :      * toasted columns, force them to be inlined. This is somewhat unlikely
     8734                 :      * since there are limits on the size of indexed columns, so we don't
     8735                 :      * duplicate toast_flatten_tuple()'s functionality in the above loop over
    8736                 :      * the indexed columns, even if it would be more efficient.
    8737                 :      */
    8738 GIC       82237 :     if (HeapTupleHasExternal(key_tuple))
    8739                 :     {
    8740               4 :         HeapTuple   oldtup = key_tuple;
    8741                 : 
    8742               4 :         key_tuple = toast_flatten_tuple(oldtup, desc);
    8743 CBC           4 :         heap_freetuple(oldtup);
    8744 ECB             :     }
    8745                 : 
    8746 CBC       82237 :     return key_tuple;
    8747 ECB             : }
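/*
 * Editor's note: an illustrative mapping from REPLICA IDENTITY settings to
 * the behavior above.  NOTHING: return NULL.  FULL: return the whole old
 * tuple (flattened first if it has external TOAST pointers).  DEFAULT or
 * USING INDEX: when a key is required, return a copy that keeps only the
 * identity columns and nulls out everything else.
 */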
    8748                 : 
    8749                 : /*
    8750                 :  * Handles XLOG_HEAP2_PRUNE record type.
    8751                 :  *
    8752                 :  * Acquires a full cleanup lock.
    8753                 :  */
    8754                 : static void
    8755 CBC        6391 : heap_xlog_prune(XLogReaderState *record)
    8756                 : {
    8757 GIC        6391 :     XLogRecPtr  lsn = record->EndRecPtr;
    8758            6391 :     xl_heap_prune *xlrec = (xl_heap_prune *) XLogRecGetData(record);
    8759                 :     Buffer      buffer;
    8760                 :     RelFileLocator rlocator;
    8761                 :     BlockNumber blkno;
    8762                 :     XLogRedoAction action;
    8763 ECB             : 
    8764 GNC        6391 :     XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
    8765                 : 
    8766                 :     /*
    8767                 :      * We're about to remove tuples. In Hot Standby mode, ensure that there's
    8768 ECB             :      * no queries running for which the removed tuples are still visible.
    8769                 :      */
    8770 CBC        6391 :     if (InHotStandby)
    8771 GNC        6059 :         ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon,
    8772            6059 :                                             xlrec->isCatalogRel,
    8773                 :                                             rlocator);
    8774                 : 
    8775                 :     /*
    8776                 :      * If we have a full-page image, restore it (using a cleanup lock) and
    8777                 :      * we're done.
    8778                 :      */
    8779 GIC        6391 :     action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true,
    8780                 :                                            &buffer);
    8781            6391 :     if (action == BLK_NEEDS_REDO)
    8782                 :     {
    8783            5725 :         Page        page = (Page) BufferGetPage(buffer);
    8784                 :         OffsetNumber *end;
    8785                 :         OffsetNumber *redirected;
    8786                 :         OffsetNumber *nowdead;
    8787                 :         OffsetNumber *nowunused;
    8788                 :         int         nredirected;
    8789 ECB             :         int         ndead;
    8790                 :         int         nunused;
    8791                 :         Size        datalen;
    8792                 : 
    8793 CBC        5725 :         redirected = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
    8794                 : 
    8795 GIC        5725 :         nredirected = xlrec->nredirected;
    8796            5725 :         ndead = xlrec->ndead;
    8797            5725 :         end = (OffsetNumber *) ((char *) redirected + datalen);
    8798            5725 :         nowdead = redirected + (nredirected * 2);
    8799 CBC        5725 :         nowunused = nowdead + ndead;
    8800 GIC        5725 :         nunused = (end - nowunused);
    8801 CBC        5725 :         Assert(nunused >= 0);
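        /*
         * Editor's note: an illustrative decoding of the block data above.
         * With nredirected = 2, ndead = 1, and datalen = 14 (seven 2-byte
         * OffsetNumbers), the flat array is carved up as:
         *   redirected: elements [0..3]  (two <from, to> pairs)
         *   nowdead:    element  [4]
         *   nowunused:  elements [5..6]  (nunused = 7 - 5 = 2)
         */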
    8802 ECB             : 
    8803                 :         /* Update all line pointers per the record, and repair fragmentation */
    8804 CBC        5725 :         heap_page_prune_execute(buffer,
    8805 ECB             :                                 redirected, nredirected,
    8806                 :                                 nowdead, ndead,
    8807                 :                                 nowunused, nunused);
    8808                 : 
    8809                 :         /*
    8810                 :          * Note: we don't worry about updating the page's prunability hints.
    8811                 :          * At worst this will cause an extra prune cycle to occur soon.
    8812                 :          */
    8813                 : 
    8814 GIC        5725 :         PageSetLSN(page, lsn);
    8815 CBC        5725 :         MarkBufferDirty(buffer);
    8816 ECB             :     }
    8817                 : 
    8818 CBC        6391 :     if (BufferIsValid(buffer))
    8819                 :     {
    8820 GIC        6391 :         Size        freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
    8821                 : 
    8822 CBC        6391 :         UnlockReleaseBuffer(buffer);
    8823 ECB             : 
    8824                 :         /*
    8825                 :          * After pruning records from a page, it's useful to update the FSM
     8826                 :          * about it, as it may cause the page to become a target for
     8827                 :          * insertions later even if vacuum decides not to visit it (which is
     8828                 :          * possible if it gets marked all-visible).
    8829                 :          *
    8830                 :          * Do this regardless of a full-page image being applied, since the
    8831                 :          * FSM data is not in the page anyway.
    8832                 :          */
    8833 GNC        6391 :         XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
    8834                 :     }
    8835 CBC        6391 : }
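
/*
 * Editor's sketch (not from heapam.c): how the prune record's single
 * block-data payload is carved into the three OffsetNumber arrays used
 * above.  Plain uint16_t stands in for OffsetNumber; the argument and
 * array names are assumptions chosen to mirror the redo code.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void
decode_prune_payload(uint16_t *redirected, size_t datalen,
                     int nredirected, int ndead,
                     uint16_t **nowdead, uint16_t **nowunused, int *nunused)
{
    uint16_t   *end = (uint16_t *) ((char *) redirected + datalen);

    /* redirect entries come in (from, to) pairs, hence the factor of 2 */
    *nowdead = redirected + nredirected * 2;
    *nowunused = *nowdead + ndead;
    /* the now-unused entries are simply whatever remains in the payload */
    *nunused = (int) (end - *nowunused);
    assert(*nunused >= 0);
}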
    8836 ECB             : 
    8837                 : /*
    8838                 :  * Handles XLOG_HEAP2_VACUUM record type.
    8839                 :  *
    8840                 :  * Acquires an ordinary exclusive lock only.
    8841                 :  */
    8842                 : static void
    8843 CBC        1348 : heap_xlog_vacuum(XLogReaderState *record)
    8844                 : {
    8845            1348 :     XLogRecPtr  lsn = record->EndRecPtr;
    8846 GIC        1348 :     xl_heap_vacuum *xlrec = (xl_heap_vacuum *) XLogRecGetData(record);
    8847 ECB             :     Buffer      buffer;
    8848                 :     BlockNumber blkno;
    8849                 :     XLogRedoAction action;
    8850                 : 
    8851                 :     /*
    8852                 :      * If we have a full-page image, restore it (without using a cleanup lock)
    8853                 :      * and we're done.
    8854                 :      */
    8855 CBC        1348 :     action = XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, false,
    8856                 :                                            &buffer);
    8857            1348 :     if (action == BLK_NEEDS_REDO)
    8858                 :     {
    8859 GIC        1274 :         Page        page = (Page) BufferGetPage(buffer);
    8860                 :         OffsetNumber *nowunused;
    8861                 :         Size        datalen;
    8862                 :         OffsetNumber *offnum;
    8863                 : 
    8864            1274 :         nowunused = (OffsetNumber *) XLogRecGetBlockData(record, 0, &datalen);
    8865                 : 
    8866 ECB             :         /* Shouldn't be a record unless there's something to do */
    8867 GIC        1274 :         Assert(xlrec->nunused > 0);
    8868 ECB             : 
    8869                 :         /* Update all now-unused line pointers */
    8870 CBC        1274 :         offnum = nowunused;
    8871          132840 :         for (int i = 0; i < xlrec->nunused; i++)
    8872                 :         {
    8873 GIC      131566 :             OffsetNumber off = *offnum++;
    8874 CBC      131566 :             ItemId      lp = PageGetItemId(page, off);
    8875                 : 
    8876 GIC      131566 :             Assert(ItemIdIsDead(lp) && !ItemIdHasStorage(lp));
    8877          131566 :             ItemIdSetUnused(lp);
    8878                 :         }
    8879                 : 
    8880                 :         /* Attempt to truncate line pointer array now */
    8881            1274 :         PageTruncateLinePointerArray(page);
    8882                 : 
    8883 CBC        1274 :         PageSetLSN(page, lsn);
    8884 GIC        1274 :         MarkBufferDirty(buffer);
    8885 ECB             :     }
    8886                 : 
    8887 GIC        1348 :     if (BufferIsValid(buffer))
    8888                 :     {
    8889            1348 :         Size        freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
    8890                 :         RelFileLocator rlocator;
    8891                 : 
    8892 GNC        1348 :         XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
    8893                 : 
    8894 GIC        1348 :         UnlockReleaseBuffer(buffer);
    8895                 : 
    8896                 :         /*
    8897                 :          * After vacuuming LP_DEAD items from a page, it's useful to update
     8898 ECB             :          * the FSM about it, as that may make the page a target for
     8899                 :          * insertions later even if vacuum decides not to visit it (which is
     8900                 :          * possible if the page gets marked all-visible).
    8901                 :          *
    8902                 :          * Do this regardless of a full-page image being applied, since the
    8903                 :          * FSM data is not in the page anyway.
    8904                 :          */
    8905 GNC        1348 :         XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
    8906                 :     }
    8907 CBC        1348 : }
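
/*
 * Editor's sketch (assumed stand-in types, not PostgreSQL's): the vacuum
 * redo loop above boils down to walking an offset array and resetting each
 * line pointer.  ItemIdSetUnused clears the flags, offset, and length;
 * line-pointer offsets are 1-based.
 */
#include <stdint.h>

typedef struct
{
    unsigned    lp_off:15,
                lp_flags:2,
                lp_len:15;
} item_id_sketch;

#define LP_UNUSED_SKETCH 0

static void
set_line_pointers_unused(item_id_sketch *lps,
                         const uint16_t *nowunused, int nunused)
{
    for (int i = 0; i < nunused; i++)
    {
        item_id_sketch *lp = &lps[nowunused[i] - 1];    /* 1-based offsets */

        lp->lp_flags = LP_UNUSED_SKETCH;
        lp->lp_off = 0;
        lp->lp_len = 0;
    }
}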
    8908                 : 
    8909 ECB             : /*
    8910                 :  * Replay XLOG_HEAP2_VISIBLE record.
    8911                 :  *
    8912                 :  * The critical integrity requirement here is that we must never end up with
    8913                 :  * a situation where the visibility map bit is set, and the page-level
    8914                 :  * PD_ALL_VISIBLE bit is clear.  If that were to occur, then a subsequent
    8915                 :  * page modification would fail to clear the visibility map bit.
    8916                 :  */
    8917                 : static void
    8918 GIC        3638 : heap_xlog_visible(XLogReaderState *record)
    8919                 : {
    8920            3638 :     XLogRecPtr  lsn = record->EndRecPtr;
    8921 CBC        3638 :     xl_heap_visible *xlrec = (xl_heap_visible *) XLogRecGetData(record);
    8922 GIC        3638 :     Buffer      vmbuffer = InvalidBuffer;
    8923 ECB             :     Buffer      buffer;
    8924                 :     Page        page;
    8925                 :     RelFileLocator rlocator;
    8926                 :     BlockNumber blkno;
    8927                 :     XLogRedoAction action;
    8928                 : 
    8929 GNC        3638 :     Assert((xlrec->flags & VISIBILITYMAP_XLOG_VALID_BITS) == xlrec->flags);
    8930                 : 
    8931            3638 :     XLogRecGetBlockTag(record, 1, &rlocator, NULL, &blkno);
    8932                 : 
    8933                 :     /*
    8934 ECB             :      * If there are any Hot Standby transactions running that have an xmin
    8935                 :      * horizon old enough that this page isn't all-visible for them, they
    8936                 :      * might incorrectly decide that an index-only scan can skip a heap fetch.
    8937                 :      *
    8938                 :      * NB: It might be better to throw some kind of "soft" conflict here that
    8939                 :      * forces any index-only scan that is in flight to perform heap fetches,
    8940                 :      * rather than killing the transaction outright.
    8941                 :      */
    8942 GIC        3638 :     if (InHotStandby)
    8943 GNC        3509 :         ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon,
    8944            3509 :                                             xlrec->flags & VISIBILITYMAP_XLOG_CATALOG_REL,
    8945                 :                                             rlocator);
    8946 ECB             : 
    8947                 :     /*
     8948                 :      * Read the heap page, if it still exists. If the heap file was dropped or
    8949                 :      * truncated later in recovery, we don't need to update the page, but we'd
    8950                 :      * better still update the visibility map.
    8951                 :      */
    8952 CBC        3638 :     action = XLogReadBufferForRedo(record, 1, &buffer);
    8953 GIC        3638 :     if (action == BLK_NEEDS_REDO)
    8954 ECB             :     {
    8955                 :         /*
    8956                 :          * We don't bump the LSN of the heap page when setting the visibility
     8957                 :          * map bit (unless checksums or wal_log_hints is enabled, in which
    8958                 :          * case we must). This exposes us to torn page hazards, but since
    8959                 :          * we're not inspecting the existing page contents in any way, we
    8960                 :          * don't care.
    8961                 :          */
    8962 GIC        2615 :         page = BufferGetPage(buffer);
    8963                 : 
    8964            2615 :         PageSetAllVisible(page);
    8965                 : 
    8966            2615 :         if (XLogHintBitIsNeeded())
    8967            2570 :             PageSetLSN(page, lsn);
    8968                 : 
    8969 CBC        2615 :         MarkBufferDirty(buffer);
    8970                 :     }
    8971 ECB             :     else if (action == BLK_RESTORED)
    8972                 :     {
    8973                 :         /*
     8974                 :          * If the heap block was backed up, we already restored it and there's
    8975                 :          * nothing more to do. (This can only happen with checksums or
    8976                 :          * wal_log_hints enabled.)
    8977                 :          */
    8978                 :     }
    8979                 : 
    8980 GIC        3638 :     if (BufferIsValid(buffer))
    8981 ECB             :     {
    8982 GIC        3593 :         Size        space = PageGetFreeSpace(BufferGetPage(buffer));
    8983 ECB             : 
    8984 GIC        3593 :         UnlockReleaseBuffer(buffer);
    8985 ECB             : 
    8986                 :         /*
     8987                 :          * Since the FSM is not WAL-logged and only updated heuristically, it
    8988                 :          * easily becomes stale in standbys.  If the standby is later promoted
    8989                 :          * and runs VACUUM, it will skip updating individual free space
    8990                 :          * figures for pages that became all-visible (or all-frozen, depending
    8991                 :          * on the vacuum mode,) which is troublesome when FreeSpaceMapVacuum
     8992                 :          * on the vacuum mode), which is troublesome when FreeSpaceMapVacuum
     8993                 :          * propagates overly optimistic free space values to upper FSM layers;
    8994                 :          * are unusable.  This can cause long stalls when there are many such
    8995                 :          * pages.
    8996                 :          *
    8997                 :          * Forestall those problems by updating FSM's idea about a page that
    8998                 :          * is becoming all-visible or all-frozen.
    8999                 :          *
    9000                 :          * Do this regardless of a full-page image being applied, since the
    9001                 :          * FSM data is not in the page anyway.
    9002                 :          */
    9003 CBC        3593 :         if (xlrec->flags & VISIBILITYMAP_VALID_BITS)
    9004 GNC        3593 :             XLogRecordPageWithFreeSpace(rlocator, blkno, space);
    9005                 :     }
    9006                 : 
    9007 ECB             :     /*
    9008                 :      * Even if we skipped the heap page update due to the LSN interlock, it's
    9009                 :      * still safe to update the visibility map.  Any WAL record that clears
    9010                 :      * the visibility map bit does so before checking the page LSN, so any
    9011                 :      * bits that need to be cleared will still be cleared.
    9012                 :      */
    9013 CBC        3638 :     if (XLogReadBufferForRedoExtended(record, 0, RBM_ZERO_ON_ERROR, false,
    9014                 :                                       &vmbuffer) == BLK_NEEDS_REDO)
    9015 ECB             :     {
    9016 GIC        3443 :         Page        vmpage = BufferGetPage(vmbuffer);
    9017                 :         Relation    reln;
    9018                 :         uint8       vmbits;
    9019 ECB             : 
    9020                 :         /* initialize the page if it was read as zeros */
    9021 CBC        3443 :         if (PageIsNew(vmpage))
    9022 UIC           0 :             PageInit(vmpage, BLCKSZ, 0);
    9023                 : 
    9024                 :         /* remove VISIBILITYMAP_XLOG_* */
    9025 GNC        3443 :         vmbits = xlrec->flags & VISIBILITYMAP_VALID_BITS;
    9026                 : 
    9027                 :         /*
    9028                 :          * XLogReadBufferForRedoExtended locked the buffer. But
    9029                 :          * visibilitymap_set will handle locking itself.
    9030                 :          */
    9031 GIC        3443 :         LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
    9032                 : 
    9033 GNC        3443 :         reln = CreateFakeRelcacheEntry(rlocator);
    9034 GIC        3443 :         visibilitymap_pin(reln, blkno, &vmbuffer);
    9035 ECB             : 
    9036 GNC        3443 :         visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
    9037                 :                           xlrec->snapshotConflictHorizon, vmbits);
    9038 ECB             : 
    9039 CBC        3443 :         ReleaseBuffer(vmbuffer);
    9040            3443 :         FreeFakeRelcacheEntry(reln);
    9041                 :     }
    9042 GIC         195 :     else if (BufferIsValid(vmbuffer))
    9043             195 :         UnlockReleaseBuffer(vmbuffer);
    9044            3638 : }
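
/*
 * Editor's sketch: before writing to the visibility map, the redo routine
 * masks the record's flags down to the bits that actually live in the map.
 * The values below are illustrative assumptions in the spirit of
 * visibilitymap.h, not copied from it.
 */
#define VM_ALL_VISIBLE_SKETCH   0x01
#define VM_ALL_FROZEN_SKETCH    0x02
#define VM_VALID_BITS_SKETCH    (VM_ALL_VISIBLE_SKETCH | VM_ALL_FROZEN_SKETCH)

static inline unsigned
vm_bits_from_xlog_flags(unsigned xlrec_flags)
{
    /* drop any VISIBILITYMAP_XLOG_*-style bookkeeping bits */
    return xlrec_flags & VM_VALID_BITS_SKETCH;
}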
    9045                 : 
    9046                 : /*
    9047 ECB             :  * Replay XLOG_HEAP2_FREEZE_PAGE records
    9048                 :  */
    9049                 : static void
    9050 GIC          90 : heap_xlog_freeze_page(XLogReaderState *record)
    9051                 : {
    9052              90 :     XLogRecPtr  lsn = record->EndRecPtr;
    9053              90 :     xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
    9054                 :     Buffer      buffer;
    9055                 : 
    9056                 :     /*
     9057                 :      * In Hot Standby mode, ensure that there are no queries running that still
    9058 ECB             :      * consider the frozen xids as running.
    9059                 :      */
    9060 CBC          90 :     if (InHotStandby)
    9061                 :     {
    9062                 :         RelFileLocator rlocator;
    9063                 : 
    9064 GNC          90 :         XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL);
    9065              90 :         ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon,
    9066              90 :                                             xlrec->isCatalogRel,
    9067                 :                                             rlocator);
    9068 ECB             :     }
    9069                 : 
    9070 GIC          90 :     if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    9071                 :     {
    9072              89 :         Page        page = BufferGetPage(buffer);
    9073                 :         xl_heap_freeze_plan *plans;
    9074                 :         OffsetNumber *offsets;
    9075 GNC          89 :         int         curoff = 0;
    9076                 : 
    9077              89 :         plans = (xl_heap_freeze_plan *) XLogRecGetBlockData(record, 0, NULL);
    9078              89 :         offsets = (OffsetNumber *) ((char *) plans +
    9079              89 :                                     (xlrec->nplans *
    9080                 :                                      sizeof(xl_heap_freeze_plan)));
    9081             196 :         for (int p = 0; p < xlrec->nplans; p++)
    9082 ECB             :         {
    9083                 :             HeapTupleFreeze frz;
    9084                 : 
    9085                 :             /*
     9086                 :              * Convert the freeze plan representation from the WAL record
     9087                 :              * into the per-tuple format used by heap_execute_freeze_tuple.
    9088                 :              */
    9089 GNC         107 :             frz.xmax = plans[p].xmax;
    9090             107 :             frz.t_infomask2 = plans[p].t_infomask2;
    9091             107 :             frz.t_infomask = plans[p].t_infomask;
    9092             107 :             frz.frzflags = plans[p].frzflags;
    9093             107 :             frz.offset = InvalidOffsetNumber;   /* unused, but be tidy */
    9094                 : 
    9095            2496 :             for (int i = 0; i < plans[p].ntuples; i++)
    9096                 :             {
    9097            2389 :                 OffsetNumber offset = offsets[curoff++];
    9098                 :                 ItemId      lp;
    9099                 :                 HeapTupleHeader tuple;
    9100                 : 
    9101            2389 :                 lp = PageGetItemId(page, offset);
    9102            2389 :                 tuple = (HeapTupleHeader) PageGetItem(page, lp);
    9103            2389 :                 heap_execute_freeze_tuple(tuple, &frz);
    9104                 :             }
    9105                 :         }
    9106                 : 
    9107 GIC          89 :         PageSetLSN(page, lsn);
    9108              89 :         MarkBufferDirty(buffer);
    9109                 :     }
    9110              90 :     if (BufferIsValid(buffer))
    9111 CBC          90 :         UnlockReleaseBuffer(buffer);
    9112 GIC          90 : }
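
/*
 * Editor's sketch (hypothetical trimmed-down types): the freeze record
 * stores nplans plan structs followed by one flat offset array; curoff
 * above is a cursor that each plan advances by its ntuples.
 */
#include <stdint.h>

typedef struct
{
    uint16_t    ntuples;        /* per-tuple freeze fields omitted */
} freeze_plan_sketch;

static void
walk_freeze_plans(const freeze_plan_sketch *plans, int nplans,
                  const uint16_t *offsets)
{
    int         curoff = 0;

    for (int p = 0; p < nplans; p++)
    {
        for (int i = 0; i < plans[p].ntuples; i++)
        {
            uint16_t    off = offsets[curoff++];

            (void) off;         /* freeze the tuple at this page offset */
        }
    }
}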
    9113 ECB             : 
    9114                 : /*
    9115                 :  * Given an "infobits" field from an XLog record, set the correct bits in the
    9116                 :  * given infomask and infomask2 for the tuple touched by the record.
    9117                 :  *
     9118                 :  * (This is the reverse of compute_infobits.)
    9119                 :  */
    9120                 : static void
    9121 GIC      418292 : fix_infomask_from_infobits(uint8 infobits, uint16 *infomask, uint16 *infomask2)
    9122                 : {
    9123          418292 :     *infomask &= ~(HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY |
    9124                 :                    HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_EXCL_LOCK);
    9125          418292 :     *infomask2 &= ~HEAP_KEYS_UPDATED;
    9126                 : 
    9127          418292 :     if (infobits & XLHL_XMAX_IS_MULTI)
    9128               2 :         *infomask |= HEAP_XMAX_IS_MULTI;
    9129          418292 :     if (infobits & XLHL_XMAX_LOCK_ONLY)
    9130           54090 :         *infomask |= HEAP_XMAX_LOCK_ONLY;
    9131          418292 :     if (infobits & XLHL_XMAX_EXCL_LOCK)
    9132           53796 :         *infomask |= HEAP_XMAX_EXCL_LOCK;
    9133                 :     /* note HEAP_XMAX_SHR_LOCK isn't considered here */
    9134 CBC      418292 :     if (infobits & XLHL_XMAX_KEYSHR_LOCK)
    9135             305 :         *infomask |= HEAP_XMAX_KEYSHR_LOCK;
    9136                 : 
    9137 GIC      418292 :     if (infobits & XLHL_KEYS_UPDATED)
    9138          278893 :         *infomask2 |= HEAP_KEYS_UPDATED;
    9139          418292 : }
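
/*
 * Editor's sketch of the forward direction: compute_infobits (not shown in
 * this excerpt) packs the same properties into the XLHL_* flag byte that
 * fix_infomask_from_infobits unpacks above.  The flag values and the
 * boolean-based signature are assumptions for illustration only.
 */
#include <stdbool.h>
#include <stdint.h>

#define XLHL_XMAX_IS_MULTI_SKETCH     0x01
#define XLHL_XMAX_LOCK_ONLY_SKETCH    0x02
#define XLHL_XMAX_EXCL_LOCK_SKETCH    0x04
#define XLHL_XMAX_KEYSHR_LOCK_SKETCH  0x08
#define XLHL_KEYS_UPDATED_SKETCH      0x10

static uint8_t
compute_infobits_sketch(bool is_multi, bool lock_only, bool excl_lock,
                        bool keyshr_lock, bool keys_updated)
{
    return (is_multi ? XLHL_XMAX_IS_MULTI_SKETCH : 0) |
        (lock_only ? XLHL_XMAX_LOCK_ONLY_SKETCH : 0) |
        (excl_lock ? XLHL_XMAX_EXCL_LOCK_SKETCH : 0) |
        (keyshr_lock ? XLHL_XMAX_KEYSHR_LOCK_SKETCH : 0) |
        (keys_updated ? XLHL_KEYS_UPDATED_SKETCH : 0);
}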
    9140                 : 
    9141                 : static void
    9142          277802 : heap_xlog_delete(XLogReaderState *record)
    9143                 : {
    9144 CBC      277802 :     XLogRecPtr  lsn = record->EndRecPtr;
    9145 GIC      277802 :     xl_heap_delete *xlrec = (xl_heap_delete *) XLogRecGetData(record);
    9146                 :     Buffer      buffer;
    9147 ECB             :     Page        page;
    9148 GIC      277802 :     ItemId      lp = NULL;
    9149                 :     HeapTupleHeader htup;
    9150                 :     BlockNumber blkno;
    9151                 :     RelFileLocator target_locator;
    9152 ECB             :     ItemPointerData target_tid;
    9153 EUB             : 
    9154 GNC      277802 :     XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
    9155 GIC      277802 :     ItemPointerSetBlockNumber(&target_tid, blkno);
    9156 CBC      277802 :     ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
    9157                 : 
    9158                 :     /*
    9159                 :      * The visibility map may need to be fixed even if the heap page is
    9160                 :      * already up-to-date.
    9161                 :      */
    9162          277802 :     if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
    9163                 :     {
    9164 GNC           4 :         Relation    reln = CreateFakeRelcacheEntry(target_locator);
    9165 CBC           4 :         Buffer      vmbuffer = InvalidBuffer;
    9166                 : 
    9167               4 :         visibilitymap_pin(reln, blkno, &vmbuffer);
    9168 GIC           4 :         visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
    9169               4 :         ReleaseBuffer(vmbuffer);
    9170 CBC           4 :         FreeFakeRelcacheEntry(reln);
    9171 ECB             :     }
    9172                 : 
    9173 CBC      277802 :     if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    9174 ECB             :     {
    9175 CBC      277729 :         page = BufferGetPage(buffer);
    9176                 : 
    9177 GIC      277729 :         if (PageGetMaxOffsetNumber(page) >= xlrec->offnum)
    9178          277729 :             lp = PageGetItemId(page, xlrec->offnum);
    9179                 : 
    9180          277729 :         if (PageGetMaxOffsetNumber(page) < xlrec->offnum || !ItemIdIsNormal(lp))
    9181 LBC           0 :             elog(PANIC, "invalid lp");
    9182                 : 
    9183 CBC      277729 :         htup = (HeapTupleHeader) PageGetItem(page, lp);
    9184 ECB             : 
    9185 GIC      277729 :         htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
    9186          277729 :         htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    9187          277729 :         HeapTupleHeaderClearHotUpdated(htup);
    9188          277729 :         fix_infomask_from_infobits(xlrec->infobits_set,
    9189                 :                                    &htup->t_infomask, &htup->t_infomask2);
    9190          277729 :         if (!(xlrec->flags & XLH_DELETE_IS_SUPER))
    9191 CBC      277729 :             HeapTupleHeaderSetXmax(htup, xlrec->xmax);
    9192                 :         else
    9193 UIC           0 :             HeapTupleHeaderSetXmin(htup, InvalidTransactionId);
    9194 GIC      277729 :         HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
    9195 ECB             : 
    9196                 :         /* Mark the page as a candidate for pruning */
    9197 CBC      277729 :         PageSetPrunable(page, XLogRecGetXid(record));
    9198                 : 
    9199 GIC      277729 :         if (xlrec->flags & XLH_DELETE_ALL_VISIBLE_CLEARED)
    9200               3 :             PageClearAllVisible(page);
    9201 ECB             : 
    9202                 :         /* Make sure t_ctid is set correctly */
    9203 CBC      277729 :         if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
    9204 GIC         119 :             HeapTupleHeaderSetMovedPartitions(htup);
    9205                 :         else
    9206 CBC      277610 :             htup->t_ctid = target_tid;
    9207 GIC      277729 :         PageSetLSN(page, lsn);
    9208 CBC      277729 :         MarkBufferDirty(buffer);
    9209 ECB             :     }
    9210 CBC      277802 :     if (BufferIsValid(buffer))
    9211 GIC      277802 :         UnlockReleaseBuffer(buffer);
    9212 CBC      277802 : }
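
/*
 * Editor's sketch: the visibility-map fixup above (fake relcache entry,
 * pin, clear, release) recurs nearly verbatim in the insert, multi-insert,
 * and update redo routines below.  A hypothetical helper built only from
 * calls already visible in this file would look like this:
 */
static void
clear_vm_bits_sketch(RelFileLocator locator, BlockNumber blkno, uint8 flags)
{
    Relation    reln = CreateFakeRelcacheEntry(locator);
    Buffer      vmbuffer = InvalidBuffer;

    visibilitymap_pin(reln, blkno, &vmbuffer);
    visibilitymap_clear(reln, blkno, vmbuffer, flags);
    ReleaseBuffer(vmbuffer);
    FreeFakeRelcacheEntry(reln);
}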
    9213                 : 
    9214                 : static void
    9215 GIC     1197422 : heap_xlog_insert(XLogReaderState *record)
    9216                 : {
    9217         1197422 :     XLogRecPtr  lsn = record->EndRecPtr;
    9218         1197422 :     xl_heap_insert *xlrec = (xl_heap_insert *) XLogRecGetData(record);
    9219                 :     Buffer      buffer;
    9220 ECB             :     Page        page;
    9221                 :     union
    9222                 :     {
    9223                 :         HeapTupleHeaderData hdr;
    9224                 :         char        data[MaxHeapTupleSize];
    9225                 :     }           tbuf;
    9226                 :     HeapTupleHeader htup;
    9227                 :     xl_heap_header xlhdr;
    9228                 :     uint32      newlen;
    9229 GIC     1197422 :     Size        freespace = 0;
    9230                 :     RelFileLocator target_locator;
    9231                 :     BlockNumber blkno;
    9232 ECB             :     ItemPointerData target_tid;
    9233                 :     XLogRedoAction action;
    9234                 : 
    9235 GNC     1197422 :     XLogRecGetBlockTag(record, 0, &target_locator, NULL, &blkno);
    9236 GIC     1197422 :     ItemPointerSetBlockNumber(&target_tid, blkno);
    9237         1197422 :     ItemPointerSetOffsetNumber(&target_tid, xlrec->offnum);
    9238 ECB             : 
    9239                 :     /*
    9240                 :      * The visibility map may need to be fixed even if the heap page is
    9241                 :      * already up-to-date.
    9242                 :      */
    9243 CBC     1197422 :     if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
    9244                 :     {
    9245 GNC         509 :         Relation    reln = CreateFakeRelcacheEntry(target_locator);
    9246 GIC         509 :         Buffer      vmbuffer = InvalidBuffer;
    9247                 : 
    9248             509 :         visibilitymap_pin(reln, blkno, &vmbuffer);
    9249             509 :         visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
    9250             509 :         ReleaseBuffer(vmbuffer);
    9251             509 :         FreeFakeRelcacheEntry(reln);
    9252 ECB             :     }
    9253                 : 
    9254                 :     /*
    9255                 :      * If we inserted the first and only tuple on the page, re-initialize the
    9256                 :      * page from scratch.
    9257                 :      */
    9258 CBC     1197422 :     if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
    9259 ECB             :     {
    9260 CBC       15692 :         buffer = XLogInitBufferForRedo(record, 0);
    9261           15692 :         page = BufferGetPage(buffer);
    9262           15692 :         PageInit(page, BufferGetPageSize(buffer), 0);
    9263           15692 :         action = BLK_NEEDS_REDO;
    9264                 :     }
    9265 ECB             :     else
    9266 CBC     1181730 :         action = XLogReadBufferForRedo(record, 0, &buffer);
    9267 GIC     1197422 :     if (action == BLK_NEEDS_REDO)
    9268 ECB             :     {
    9269                 :         Size        datalen;
    9270                 :         char       *data;
    9271                 : 
    9272 GIC     1196841 :         page = BufferGetPage(buffer);
    9273 ECB             : 
    9274 GIC     1196841 :         if (PageGetMaxOffsetNumber(page) + 1 < xlrec->offnum)
    9275 LBC           0 :             elog(PANIC, "invalid max offset number");
    9276 ECB             : 
    9277 GIC     1196841 :         data = XLogRecGetBlockData(record, 0, &datalen);
    9278                 : 
    9279 CBC     1196841 :         newlen = datalen - SizeOfHeapHeader;
    9280 GIC     1196841 :         Assert(datalen > SizeOfHeapHeader && newlen <= MaxHeapTupleSize);
    9281         1196841 :         memcpy((char *) &xlhdr, data, SizeOfHeapHeader);
    9282         1196841 :         data += SizeOfHeapHeader;
    9283                 : 
    9284         1196841 :         htup = &tbuf.hdr;
    9285 CBC     1196841 :         MemSet((char *) htup, 0, SizeofHeapTupleHeader);
    9286 ECB             :         /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
    9287 CBC     1196841 :         memcpy((char *) htup + SizeofHeapTupleHeader,
    9288                 :                data,
    9289                 :                newlen);
    9290 GIC     1196841 :         newlen += SizeofHeapTupleHeader;
    9291         1196841 :         htup->t_infomask2 = xlhdr.t_infomask2;
    9292         1196841 :         htup->t_infomask = xlhdr.t_infomask;
    9293 CBC     1196841 :         htup->t_hoff = xlhdr.t_hoff;
    9294 GIC     1196841 :         HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
    9295 CBC     1196841 :         HeapTupleHeaderSetCmin(htup, FirstCommandId);
    9296         1196841 :         htup->t_ctid = target_tid;
    9297                 : 
    9298         1196841 :         if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum,
    9299 ECB             :                         true, true) == InvalidOffsetNumber)
    9300 LBC           0 :             elog(PANIC, "failed to add tuple");
    9301 ECB             : 
    9302 GIC     1196841 :         freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
    9303                 : 
    9304 CBC     1196841 :         PageSetLSN(page, lsn);
    9305                 : 
    9306         1196841 :         if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
    9307 GIC         305 :             PageClearAllVisible(page);
    9308 ECB             : 
    9309                 :         /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
    9310 GIC     1196841 :         if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
    9311 LBC           0 :             PageSetAllVisible(page);
    9312 EUB             : 
    9313 GIC     1196841 :         MarkBufferDirty(buffer);
    9314 ECB             :     }
    9315 GIC     1197422 :     if (BufferIsValid(buffer))
    9316 CBC     1197422 :         UnlockReleaseBuffer(buffer);
    9317 ECB             : 
    9318                 :     /*
    9319                 :      * If the page is running low on free space, update the FSM as well.
    9320                 :      * Arbitrarily, our definition of "low" is less than 20%. We can't do much
    9321                 :      * better than that without knowing the fill-factor for the table.
    9322                 :      *
     9323                 :      * XXX: Don't do this if the page was restored from a full-page image. We
     9324 EUB             :      * don't bother to update the FSM in that case; it doesn't need to be
    9325 ECB             :      * totally accurate anyway.
    9326                 :      */
    9327 GIC     1197422 :     if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
    9328 GNC      237217 :         XLogRecordPageWithFreeSpace(target_locator, blkno, freespace);
    9329 GIC     1197422 : }
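
/*
 * Editor's sketch: with the default 8 kB page size (an assumption; the real
 * value is the compile-time BLCKSZ), the "low on free space" threshold used
 * above works out to 8192 / 5 = 1638 bytes, just under 20% of the page.
 */
#include <stdbool.h>
#include <stddef.h>

#define BLCKSZ_SKETCH 8192

static inline bool
fsm_update_needed_sketch(size_t freespace)
{
    return freespace < BLCKSZ_SKETCH / 5;   /* i.e. < 1638 bytes */
}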
    9330 ECB             : 
    9331                 : /*
    9332                 :  * Handles MULTI_INSERT record type.
    9333                 :  */
    9334                 : static void
    9335 CBC       46851 : heap_xlog_multi_insert(XLogReaderState *record)
    9336                 : {
    9337           46851 :     XLogRecPtr  lsn = record->EndRecPtr;
    9338 ECB             :     xl_heap_multi_insert *xlrec;
    9339                 :     RelFileLocator rlocator;
    9340                 :     BlockNumber blkno;
    9341                 :     Buffer      buffer;
    9342                 :     Page        page;
    9343                 :     union
    9344                 :     {
    9345                 :         HeapTupleHeaderData hdr;
    9346                 :         char        data[MaxHeapTupleSize];
    9347                 :     }           tbuf;
    9348                 :     HeapTupleHeader htup;
    9349                 :     uint32      newlen;
    9350 GIC       46851 :     Size        freespace = 0;
    9351                 :     int         i;
    9352           46851 :     bool        isinit = (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE) != 0;
    9353                 :     XLogRedoAction action;
    9354                 : 
    9355                 :     /*
    9356                 :      * Insertion doesn't overwrite MVCC data, so no conflict processing is
    9357                 :      * required.
    9358                 :      */
    9359           46851 :     xlrec = (xl_heap_multi_insert *) XLogRecGetData(record);
    9360 ECB             : 
    9361 GNC       46851 :     XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
    9362                 : 
    9363                 :     /* check that the mutually exclusive flags are not both set */
    9364 GIC       46851 :     Assert(!((xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED) &&
    9365                 :              (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)));
    9366 ECB             : 
    9367                 :     /*
    9368                 :      * The visibility map may need to be fixed even if the heap page is
    9369                 :      * already up-to-date.
    9370                 :      */
    9371 GIC       46851 :     if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
    9372                 :     {
    9373 GNC         477 :         Relation    reln = CreateFakeRelcacheEntry(rlocator);
    9374 CBC         477 :         Buffer      vmbuffer = InvalidBuffer;
    9375                 : 
    9376             477 :         visibilitymap_pin(reln, blkno, &vmbuffer);
    9377             477 :         visibilitymap_clear(reln, blkno, vmbuffer, VISIBILITYMAP_VALID_BITS);
    9378 GIC         477 :         ReleaseBuffer(vmbuffer);
    9379 CBC         477 :         FreeFakeRelcacheEntry(reln);
    9380 ECB             :     }
    9381                 : 
    9382 CBC       46851 :     if (isinit)
    9383                 :     {
    9384 GIC        1979 :         buffer = XLogInitBufferForRedo(record, 0);
    9385            1979 :         page = BufferGetPage(buffer);
    9386            1979 :         PageInit(page, BufferGetPageSize(buffer), 0);
    9387            1979 :         action = BLK_NEEDS_REDO;
    9388                 :     }
    9389 ECB             :     else
    9390 GIC       44872 :         action = XLogReadBufferForRedo(record, 0, &buffer);
    9391 CBC       46851 :     if (action == BLK_NEEDS_REDO)
    9392 ECB             :     {
    9393                 :         char       *tupdata;
    9394                 :         char       *endptr;
    9395                 :         Size        len;
    9396                 : 
    9397                 :         /* Tuples are stored as block data */
    9398 CBC       46406 :         tupdata = XLogRecGetBlockData(record, 0, &len);
    9399 GIC       46406 :         endptr = tupdata + len;
    9400                 : 
    9401           46406 :         page = (Page) BufferGetPage(buffer);
    9402                 : 
    9403 CBC      236311 :         for (i = 0; i < xlrec->ntuples; i++)
    9404                 :         {
    9405 ECB             :             OffsetNumber offnum;
    9406 EUB             :             xl_multi_insert_tuple *xlhdr;
    9407                 : 
    9408 ECB             :             /*
    9409                 :              * If we're reinitializing the page, the tuples are stored in
    9410                 :              * order from FirstOffsetNumber. Otherwise there's an array of
    9411                 :              * offsets in the WAL record, and the tuples come after that.
    9412                 :              */
    9413 CBC      189905 :             if (isinit)
    9414 GIC       99841 :                 offnum = FirstOffsetNumber + i;
    9415 ECB             :             else
    9416 CBC       90064 :                 offnum = xlrec->offsets[i];
    9417 GIC      189905 :             if (PageGetMaxOffsetNumber(page) + 1 < offnum)
    9418 LBC           0 :                 elog(PANIC, "invalid max offset number");
    9419                 : 
    9420 GIC      189905 :             xlhdr = (xl_multi_insert_tuple *) SHORTALIGN(tupdata);
    9421 CBC      189905 :             tupdata = ((char *) xlhdr) + SizeOfMultiInsertTuple;
    9422 ECB             : 
    9423 CBC      189905 :             newlen = xlhdr->datalen;
    9424          189905 :             Assert(newlen <= MaxHeapTupleSize);
    9425          189905 :             htup = &tbuf.hdr;
    9426          189905 :             MemSet((char *) htup, 0, SizeofHeapTupleHeader);
    9427 ECB             :             /* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
    9428 GIC      189905 :             memcpy((char *) htup + SizeofHeapTupleHeader,
    9429 ECB             :                    (char *) tupdata,
    9430                 :                    newlen);
    9431 GBC      189905 :             tupdata += newlen;
    9432                 : 
    9433 CBC      189905 :             newlen += SizeofHeapTupleHeader;
    9434 GIC      189905 :             htup->t_infomask2 = xlhdr->t_infomask2;
    9435 CBC      189905 :             htup->t_infomask = xlhdr->t_infomask;
    9436 GIC      189905 :             htup->t_hoff = xlhdr->t_hoff;
    9437 CBC      189905 :             HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
    9438          189905 :             HeapTupleHeaderSetCmin(htup, FirstCommandId);
    9439 GIC      189905 :             ItemPointerSetBlockNumber(&htup->t_ctid, blkno);
    9440          189905 :             ItemPointerSetOffsetNumber(&htup->t_ctid, offnum);
    9441 ECB             : 
    9442 GBC      189905 :             offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
    9443 GIC      189905 :             if (offnum == InvalidOffsetNumber)
    9444 LBC           0 :                 elog(PANIC, "failed to add tuple");
    9445                 :         }
    9446 CBC       46406 :         if (tupdata != endptr)
    9447 LBC           0 :             elog(PANIC, "total tuple length mismatch");
    9448                 : 
    9449 GIC       46406 :         freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
    9450                 : 
    9451           46406 :         PageSetLSN(page, lsn);
    9452                 : 
    9453           46406 :         if (xlrec->flags & XLH_INSERT_ALL_VISIBLE_CLEARED)
    9454             181 :             PageClearAllVisible(page);
    9455                 : 
    9456                 :         /* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
    9457           46406 :         if (xlrec->flags & XLH_INSERT_ALL_FROZEN_SET)
    9458 CBC           4 :             PageSetAllVisible(page);
    9459 ECB             : 
    9460 CBC       46406 :         MarkBufferDirty(buffer);
    9461                 :     }
    9462 GIC       46851 :     if (BufferIsValid(buffer))
    9463           46851 :         UnlockReleaseBuffer(buffer);
    9464                 : 
    9465                 :     /*
    9466 ECB             :      * If the page is running low on free space, update the FSM as well.
    9467                 :      * Arbitrarily, our definition of "low" is less than 20%. We can't do much
    9468                 :      * better than that without knowing the fill-factor for the table.
    9469                 :      *
     9470                 :      * XXX: Don't do this if the page was restored from a full-page image. We
     9471                 :      * don't bother to update the FSM in that case; it doesn't need to be
    9472                 :      * totally accurate anyway.
    9473                 :      */
    9474 GIC       46851 :     if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
    9475 GNC       11673 :         XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
    9476 GIC       46851 : }
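
/*
 * Editor's sketch (hypothetical trimmed-down header type): each multi-insert
 * tuple header in the block data is 2-byte aligned, which is what the
 * SHORTALIGN above provides.  A stand-in walk over the packed payload:
 */
#include <stdint.h>

#define SHORTALIGN_SKETCH(p) \
    ((char *) (((uintptr_t) (p) + 1) & ~(uintptr_t) 1))

typedef struct
{
    uint16_t    datalen;        /* infomask fields omitted */
} multi_insert_hdr_sketch;

static int
count_multi_insert_tuples(char *tupdata, char *endptr)
{
    int         ntuples = 0;

    while (tupdata < endptr)
    {
        multi_insert_hdr_sketch *hdr =
            (multi_insert_hdr_sketch *) SHORTALIGN_SKETCH(tupdata);

        tupdata = (char *) hdr + sizeof(*hdr) + hdr->datalen;
        ntuples++;
    }
    return ntuples;
}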
    9477                 : 
    9478                 : /*
    9479                 :  * Handles UPDATE and HOT_UPDATE
    9480                 :  */
    9481 ECB             : static void
    9482 GIC       86497 : heap_xlog_update(XLogReaderState *record, bool hot_update)
    9483 ECB             : {
    9484 GIC       86497 :     XLogRecPtr  lsn = record->EndRecPtr;
    9485           86497 :     xl_heap_update *xlrec = (xl_heap_update *) XLogRecGetData(record);
    9486                 :     RelFileLocator rlocator;
    9487                 :     BlockNumber oldblk;
    9488                 :     BlockNumber newblk;
    9489                 :     ItemPointerData newtid;
    9490 ECB             :     Buffer      obuffer,
    9491                 :                 nbuffer;
    9492                 :     Page        page;
    9493                 :     OffsetNumber offnum;
    9494 GIC       86497 :     ItemId      lp = NULL;
    9495 ECB             :     HeapTupleData oldtup;
    9496                 :     HeapTupleHeader htup;
    9497 GIC       86497 :     uint16      prefixlen = 0,
    9498           86497 :                 suffixlen = 0;
    9499                 :     char       *newp;
    9500                 :     union
    9501                 :     {
    9502 ECB             :         HeapTupleHeaderData hdr;
    9503                 :         char        data[MaxHeapTupleSize];
    9504                 :     }           tbuf;
    9505                 :     xl_heap_header xlhdr;
    9506                 :     uint32      newlen;
    9507 CBC       86497 :     Size        freespace = 0;
    9508 ECB             :     XLogRedoAction oldaction;
    9509                 :     XLogRedoAction newaction;
    9510                 : 
    9511                 :     /* initialize to keep the compiler quiet */
    9512 GIC       86497 :     oldtup.t_data = NULL;
    9513 CBC       86497 :     oldtup.t_len = 0;
    9514                 : 
    9515 GNC       86497 :     XLogRecGetBlockTag(record, 0, &rlocator, NULL, &newblk);
    9516 CBC       86497 :     if (XLogRecGetBlockTagExtended(record, 1, NULL, NULL, &oldblk, NULL))
    9517 ECB             :     {
    9518                 :         /* HOT updates are never done across pages */
    9519 GIC       53381 :         Assert(!hot_update);
    9520                 :     }
    9521 ECB             :     else
    9522 CBC       33116 :         oldblk = newblk;
    9523                 : 
    9524 GIC       86497 :     ItemPointerSet(&newtid, newblk, xlrec->new_offnum);
    9525                 : 
    9526                 :     /*
    9527                 :      * The visibility map may need to be fixed even if the heap page is
    9528                 :      * already up-to-date.
    9529 ECB             :      */
    9530 CBC       86497 :     if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
    9531                 :     {
    9532 GNC         245 :         Relation    reln = CreateFakeRelcacheEntry(rlocator);
    9533 GIC         245 :         Buffer      vmbuffer = InvalidBuffer;
    9534 ECB             : 
    9535 GIC         245 :         visibilitymap_pin(reln, oldblk, &vmbuffer);
    9536             245 :         visibilitymap_clear(reln, oldblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
    9537             245 :         ReleaseBuffer(vmbuffer);
    9538             245 :         FreeFakeRelcacheEntry(reln);
    9539                 :     }
    9540                 : 
    9541                 :     /*
    9542                 :      * In normal operation, it is important to lock the two pages in
    9543                 :      * page-number order, to avoid possible deadlocks against other update
    9544 ECB             :      * operations going the other way.  However, during WAL replay there can
    9545                 :      * be no other update happening, so we don't need to worry about that. But
    9546                 :      * we *do* need to worry that we don't expose an inconsistent state to Hot
    9547                 :      * Standby queries --- so the original page can't be unlocked before we've
    9548                 :      * added the new tuple to the new page.
    9549 EUB             :      */
    9550                 : 
    9551 ECB             :     /* Deal with old tuple version */
    9552 CBC       86497 :     oldaction = XLogReadBufferForRedo(record, (oldblk == newblk) ? 0 : 1,
    9553                 :                                       &obuffer);
    9554           86497 :     if (oldaction == BLK_NEEDS_REDO)
    9555 ECB             :     {
    9556 CBC       86473 :         page = BufferGetPage(obuffer);
    9557           86473 :         offnum = xlrec->old_offnum;
    9558 GIC       86473 :         if (PageGetMaxOffsetNumber(page) >= offnum)
    9559 CBC       86473 :             lp = PageGetItemId(page, offnum);
    9560                 : 
    9561 GIC       86473 :         if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
    9562 LBC           0 :             elog(PANIC, "invalid lp");
    9563                 : 
    9564 CBC       86473 :         htup = (HeapTupleHeader) PageGetItem(page, lp);
    9565 ECB             : 
    9566 CBC       86473 :         oldtup.t_data = htup;
    9567           86473 :         oldtup.t_len = ItemIdGetLength(lp);
    9568 ECB             : 
    9569 CBC       86473 :         htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
    9570           86473 :         htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    9571           86473 :         if (hot_update)
    9572 GIC       30897 :             HeapTupleHeaderSetHotUpdated(htup);
    9573 ECB             :         else
    9574 CBC       55576 :             HeapTupleHeaderClearHotUpdated(htup);
    9575 GBC       86473 :         fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask,
    9576                 :                                    &htup->t_infomask2);
    9577 CBC       86473 :         HeapTupleHeaderSetXmax(htup, xlrec->old_xmax);
    9578 GBC       86473 :         HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
    9579                 :         /* Set forward chain link in t_ctid */
    9580 CBC       86473 :         htup->t_ctid = newtid;
    9581                 : 
    9582 ECB             :         /* Mark the page as a candidate for pruning */
    9583 GIC       86473 :         PageSetPrunable(page, XLogRecGetXid(record));
    9584 ECB             : 
    9585 CBC       86473 :         if (xlrec->flags & XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED)
    9586 GIC         243 :             PageClearAllVisible(page);
    9587                 : 
    9588 CBC       86473 :         PageSetLSN(page, lsn);
    9589           86473 :         MarkBufferDirty(obuffer);
    9590                 :     }
    9591 ECB             : 
    9592                 :     /*
    9593                 :      * Read the page the new tuple goes into, if different from old.
    9594                 :      */
    9595 GIC       86497 :     if (oldblk == newblk)
    9596                 :     {
    9597           33116 :         nbuffer = obuffer;
    9598           33116 :         newaction = oldaction;
    9599                 :     }
    9600           53381 :     else if (XLogRecGetInfo(record) & XLOG_HEAP_INIT_PAGE)
    9601                 :     {
    9602             594 :         nbuffer = XLogInitBufferForRedo(record, 0);
    9603             594 :         page = (Page) BufferGetPage(nbuffer);
    9604             594 :         PageInit(page, BufferGetPageSize(nbuffer), 0);
    9605 CBC         594 :         newaction = BLK_NEEDS_REDO;
    9606 ECB             :     }
    9607                 :     else
    9608 GIC       52787 :         newaction = XLogReadBufferForRedo(record, 0, &nbuffer);
    9609                 : 
    9610                 :     /*
    9611                 :      * The visibility map may need to be fixed even if the heap page is
    9612                 :      * already up-to-date.
    9613 ECB             :      */
    9614 GIC       86497 :     if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
    9615 ECB             :     {
    9616 GNC          36 :         Relation    reln = CreateFakeRelcacheEntry(rlocator);
    9617 GIC          36 :         Buffer      vmbuffer = InvalidBuffer;
    9618                 : 
    9619              36 :         visibilitymap_pin(reln, newblk, &vmbuffer);
    9620              36 :         visibilitymap_clear(reln, newblk, vmbuffer, VISIBILITYMAP_VALID_BITS);
    9621              36 :         ReleaseBuffer(vmbuffer);
    9622              36 :         FreeFakeRelcacheEntry(reln);
    9623                 :     }
    9624                 : 
    9625 ECB             :     /* Deal with new tuple */
    9626 GIC       86497 :     if (newaction == BLK_NEEDS_REDO)
    9627                 :     {
    9628 ECB             :         char       *recdata;
    9629                 :         char       *recdata_end;
    9630                 :         Size        datalen;
    9631                 :         Size        tuplen;
    9632                 : 
    9633 GIC       86462 :         recdata = XLogRecGetBlockData(record, 0, &datalen);
    9634           86462 :         recdata_end = recdata + datalen;
    9635                 : 
    9636           86462 :         page = BufferGetPage(nbuffer);
    9637                 : 
    9638 CBC       86462 :         offnum = xlrec->new_offnum;
    9639 GIC       86462 :         if (PageGetMaxOffsetNumber(page) + 1 < offnum)
    9640 UIC           0 :             elog(PANIC, "invalid max offset number");
    9641                 : 
    9642 GIC       86462 :         if (xlrec->flags & XLH_UPDATE_PREFIX_FROM_OLD)
    9643 ECB             :         {
    9644 CBC       12522 :             Assert(newblk == oldblk);
    9645 GIC       12522 :             memcpy(&prefixlen, recdata, sizeof(uint16));
    9646 CBC       12522 :             recdata += sizeof(uint16);
    9647 ECB             :         }
    9648 GIC       86462 :         if (xlrec->flags & XLH_UPDATE_SUFFIX_FROM_OLD)
    9649                 :         {
    9650 CBC       29908 :             Assert(newblk == oldblk);
    9651 GIC       29908 :             memcpy(&suffixlen, recdata, sizeof(uint16));
    9652           29908 :             recdata += sizeof(uint16);
    9653 ECB             :         }
    9654                 : 
    9655 CBC       86462 :         memcpy((char *) &xlhdr, recdata, SizeOfHeapHeader);
    9656 GIC       86462 :         recdata += SizeOfHeapHeader;
    9657                 : 
    9658           86462 :         tuplen = recdata_end - recdata;
    9659           86462 :         Assert(tuplen <= MaxHeapTupleSize);
    9660                 : 
    9661 CBC       86462 :         htup = &tbuf.hdr;
    9662 GIC       86462 :         MemSet((char *) htup, 0, SizeofHeapTupleHeader);
    9663 ECB             : 
    9664                 :         /*
    9665                 :          * Reconstruct the new tuple using the prefix and/or suffix from the
    9666                 :          * old tuple, and the data stored in the WAL record.
    9667                 :          */
    9668 CBC       86462 :         newp = (char *) htup + SizeofHeapTupleHeader;
    9669           86462 :         if (prefixlen > 0)
    9670                 :         {
    9671                 :             int         len;
    9672                 : 
    9673                 :             /* copy bitmap [+ padding] [+ oid] from WAL record */
    9674 GIC       12522 :             len = xlhdr.t_hoff - SizeofHeapTupleHeader;
    9675           12522 :             memcpy(newp, recdata, len);
    9676           12522 :             recdata += len;
    9677           12522 :             newp += len;
    9678                 : 
    9679                 :             /* copy prefix from old tuple */
    9680           12522 :             memcpy(newp, (char *) oldtup.t_data + oldtup.t_data->t_hoff, prefixlen);
    9681           12522 :             newp += prefixlen;
    9682                 : 
    9683 ECB             :             /* copy new tuple data from WAL record */
    9684 GIC       12522 :             len = tuplen - (xlhdr.t_hoff - SizeofHeapTupleHeader);
    9685 CBC       12522 :             memcpy(newp, recdata, len);
    9686 GIC       12522 :             recdata += len;
    9687 CBC       12522 :             newp += len;
    9688 ECB             :         }
    9689                 :         else
    9690                 :         {
    9691                 :             /*
    9692                 :              * copy bitmap [+ padding] [+ oid] + data from record, all in one
    9693 EUB             :              * go
    9694                 :              */
    9695 CBC       73940 :             memcpy(newp, recdata, tuplen);
    9696 GIC       73940 :             recdata += tuplen;
    9697 CBC       73940 :             newp += tuplen;
    9698 ECB             :         }
    9699 GIC       86462 :         Assert(recdata == recdata_end);
    9700 ECB             : 
    9701                 :         /* copy suffix from old tuple */
    9702 CBC       86462 :         if (suffixlen > 0)
    9703           29908 :             memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
    9704                 : 
    9705           86462 :         newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;
    9706           86462 :         htup->t_infomask2 = xlhdr.t_infomask2;
    9707 GIC       86462 :         htup->t_infomask = xlhdr.t_infomask;
    9708 CBC       86462 :         htup->t_hoff = xlhdr.t_hoff;
    9709 ECB             : 
    9710 GIC       86462 :         HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record));
    9711 CBC       86462 :         HeapTupleHeaderSetCmin(htup, FirstCommandId);
    9712 GIC       86462 :         HeapTupleHeaderSetXmax(htup, xlrec->new_xmax);
    9713                 :         /* Make sure there is no forward chain link in t_ctid */
    9714 CBC       86462 :         htup->t_ctid = newtid;
    9715                 : 
    9716           86462 :         offnum = PageAddItem(page, (Item) htup, newlen, offnum, true, true);
    9717           86462 :         if (offnum == InvalidOffsetNumber)
    9718 UIC           0 :             elog(PANIC, "failed to add tuple");
    9719 ECB             : 
    9720 CBC       86462 :         if (xlrec->flags & XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED)
    9721 GIC          28 :             PageClearAllVisible(page);
    9722                 : 
    9723           86462 :         freespace = PageGetHeapFreeSpace(page); /* needed to update FSM below */
    9724                 : 
    9725           86462 :         PageSetLSN(page, lsn);
    9726 CBC       86462 :         MarkBufferDirty(nbuffer);
    9727                 :     }
    9728 ECB             : 
    9729 CBC       86497 :     if (BufferIsValid(nbuffer) && nbuffer != obuffer)
    9730 GIC       53381 :         UnlockReleaseBuffer(nbuffer);
    9731 CBC       86497 :     if (BufferIsValid(obuffer))
    9732 GIC       86497 :         UnlockReleaseBuffer(obuffer);
    9733 ECB             : 
    9734                 :     /*
    9735                 :      * If the new page is running low on free space, update the FSM as well.
    9736                 :      * Arbitrarily, our definition of "low" is less than 20%. We can't do much
    9737                 :      * better than that without knowing the fill-factor for the table.
    9738                 :      *
    9739                 :      * However, don't update the FSM on HOT updates, because after crash
    9740                 :      * recovery, either the old or the new tuple will certainly be dead and
    9741                 :      * prunable. After pruning, the page will have roughly as much free space
    9742                 :      * as it did before the update, assuming the new tuple is about the same
    9743                 :      * size as the old one.
    9744                 :      *
     9745                 :      * XXX: Don't do this if the page was restored from a full-page image. We
     9746                 :      * don't bother to update the FSM in that case; it doesn't need to be
    9747                 :      * totally accurate anyway.
    9748                 :      */
    9749 GIC       86497 :     if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
    9750 GNC       11226 :         XLogRecordPageWithFreeSpace(rlocator, newblk, freespace);
    9751 CBC       86497 : }
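
/*
 * Editor's sketch: the update redo reconstructs the new tuple's data area
 * from an optional prefix of the old tuple, the changed bytes carried in
 * the WAL record, and an optional suffix of the old tuple.  A simplified
 * memcpy-only version (buffer names are assumptions; the null-bitmap
 * handling done above is omitted):
 */
#include <stdint.h>
#include <string.h>

static size_t
reconstruct_tuple_data_sketch(char *dst,
                              const char *olddata, size_t oldlen,
                              const char *waldata, size_t wallen,
                              uint16_t prefixlen, uint16_t suffixlen)
{
    char       *p = dst;

    memcpy(p, olddata, prefixlen);      /* prefix from the old tuple */
    p += prefixlen;
    memcpy(p, waldata, wallen);         /* changed middle from the WAL record */
    p += wallen;
    memcpy(p, olddata + oldlen - suffixlen, suffixlen); /* suffix */
    p += suffixlen;

    return (size_t) (p - dst);
}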
    9752 ECB             : 
    9753                 : static void
    9754 GIC          61 : heap_xlog_confirm(XLogReaderState *record)
    9755                 : {
    9756              61 :     XLogRecPtr  lsn = record->EndRecPtr;
    9757 CBC          61 :     xl_heap_confirm *xlrec = (xl_heap_confirm *) XLogRecGetData(record);
    9758                 :     Buffer      buffer;
    9759                 :     Page        page;
    9760                 :     OffsetNumber offnum;
    9761 GIC          61 :     ItemId      lp = NULL;
    9762                 :     HeapTupleHeader htup;
    9763                 : 
    9764 CBC          61 :     if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    9765 ECB             :     {
    9766 GIC          61 :         page = BufferGetPage(buffer);
    9767 ECB             : 
    9768 GIC          61 :         offnum = xlrec->offnum;
    9769 CBC          61 :         if (PageGetMaxOffsetNumber(page) >= offnum)
    9770              61 :             lp = PageGetItemId(page, offnum);
    9771 EUB             : 
    9772 GIC          61 :         if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
    9773 LBC           0 :             elog(PANIC, "invalid lp");
    9774                 : 
    9775 CBC          61 :         htup = (HeapTupleHeader) PageGetItem(page, lp);
    9776 ECB             : 
    9777                 :         /*
    9778                 :          * Confirm tuple as actually inserted
    9779                 :          */
    9780 GIC          61 :         ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);
    9781 ECB             : 
    9782 CBC          61 :         PageSetLSN(page, lsn);
    9783              61 :         MarkBufferDirty(buffer);
    9784                 :     }
    9785 GIC          61 :     if (BufferIsValid(buffer))
    9786 CBC          61 :         UnlockReleaseBuffer(buffer);
    9787              61 : }
    9788                 : 
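heap_xlog_confirm replays the confirmation of a speculative insertion (the INSERT ... ON CONFLICT path): until confirmed, t_ctid holds a speculative token rather than a real position, and confirmation rewrites it to point at the tuple itself. A minimal sketch of that self-pointing convention, using stand-in types instead of the real BlockNumber/OffsetNumber/ItemPointerData:

    typedef unsigned int   SkBlockNumber;     /* stand-in for BlockNumber */
    typedef unsigned short SkOffsetNumber;    /* stand-in for OffsetNumber */

    typedef struct
    {
        SkBlockNumber  blk;
        SkOffsetNumber off;
    } SkItemPointer;

    /* Confirmation replaces the speculative token in t_ctid with the
     * tuple's own (block, offset), turning it into a normal insertion. */
    static void
    sk_confirm_insertion(SkItemPointer *t_ctid,
                         SkBlockNumber blk, SkOffsetNumber off)
    {
        t_ctid->blk = blk;
        t_ctid->off = off;
    }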
    9789 ECB             : static void
    9790 CBC       54149 : heap_xlog_lock(XLogReaderState *record)
    9791                 : {
    9792           54149 :     XLogRecPtr  lsn = record->EndRecPtr;
    9793           54149 :     xl_heap_lock *xlrec = (xl_heap_lock *) XLogRecGetData(record);
    9794                 :     Buffer      buffer;
    9795                 :     Page        page;
    9796                 :     OffsetNumber offnum;
    9797 GIC       54149 :     ItemId      lp = NULL;
    9798                 :     HeapTupleHeader htup;
    9799 ECB             : 
    9800                 :     /*
    9801                 :      * The visibility map may need to be fixed even if the heap page is
    9802                 :      * already up-to-date.
    9803                 :      */
    9804 GIC       54149 :     if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
    9805 ECB             :     {
    9806                 :         RelFileLocator rlocator;
    9807 CBC          17 :         Buffer      vmbuffer = InvalidBuffer;
    9808 ECB             :         BlockNumber block;
    9809                 :         Relation    reln;
    9810                 : 
    9811 GNC          17 :         XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block);
    9812              17 :         reln = CreateFakeRelcacheEntry(rlocator);
    9813                 : 
    9814 GIC          17 :         visibilitymap_pin(reln, block, &vmbuffer);
    9815 CBC          17 :         visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
    9816 ECB             : 
    9817 CBC          17 :         ReleaseBuffer(vmbuffer);
    9818              17 :         FreeFakeRelcacheEntry(reln);
    9819                 :     }
    9820                 : 
    9821 GIC       54149 :     if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    9822                 :     {
    9823           54090 :         page = (Page) BufferGetPage(buffer);
    9824                 : 
    9825           54090 :         offnum = xlrec->offnum;
    9826 CBC       54090 :         if (PageGetMaxOffsetNumber(page) >= offnum)
    9827           54090 :             lp = PageGetItemId(page, offnum);
    9828 ECB             : 
    9829 GIC       54090 :         if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
    9830 LBC           0 :             elog(PANIC, "invalid lp");
    9831                 : 
    9832 GIC       54090 :         htup = (HeapTupleHeader) PageGetItem(page, lp);
    9833 ECB             : 
    9834 CBC       54090 :         htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
    9835 GIC       54090 :         htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    9836 CBC       54090 :         fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
    9837 ECB             :                                    &htup->t_infomask2);
    9838                 : 
    9839                 :         /*
    9840                 :          * Clear relevant update flags, but only if the modified infomask says
    9841                 :          * there's no update.
    9842                 :          */
    9843 CBC       54090 :         if (HEAP_XMAX_IS_LOCKED_ONLY(htup->t_infomask))
    9844                 :         {
    9845           54090 :             HeapTupleHeaderClearHotUpdated(htup);
    9846                 :             /* Make sure there is no forward chain link in t_ctid */
    9847           54090 :             ItemPointerSet(&htup->t_ctid,
    9848 ECB             :                            BufferGetBlockNumber(buffer),
    9849 EUB             :                            offnum);
    9850                 :         }
    9851 CBC       54090 :         HeapTupleHeaderSetXmax(htup, xlrec->locking_xid);
    9852           54090 :         HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
    9853 GIC       54090 :         PageSetLSN(page, lsn);
    9854 CBC       54090 :         MarkBufferDirty(buffer);
    9855                 :     }
    9856           54149 :     if (BufferIsValid(buffer))
    9857           54149 :         UnlockReleaseBuffer(buffer);
    9858 GIC       54149 : }
    9859                 : 
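The infomask handling in heap_xlog_lock follows a common redo idiom: first clear every bit the record could affect, then set exactly the bits the record carries, so replay converges to the same page state no matter what was there before. A sketch of the idiom with a hypothetical mask value (the real HEAP_XMAX_BITS and friends live in htup_details.h):

    #include <stdint.h>

    #define SK_XMAX_STATE_BITS 0x00F0u  /* hypothetical stand-in for HEAP_XMAX_BITS */

    static void
    sk_redo_apply_lock_bits(uint16_t *infomask, uint16_t logged_bits)
    {
        *infomask &= (uint16_t) ~SK_XMAX_STATE_BITS;                 /* wipe stale state */
        *infomask |= (uint16_t) (logged_bits & SK_XMAX_STATE_BITS);  /* apply logged state */
    }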
    9860 ECB             : static void
    9861 LBC           0 : heap_xlog_lock_updated(XLogReaderState *record)
    9862 ECB             : {
    9863 LBC           0 :     XLogRecPtr  lsn = record->EndRecPtr;
    9864                 :     xl_heap_lock_updated *xlrec;
    9865                 :     Buffer      buffer;
    9866                 :     Page        page;
    9867                 :     OffsetNumber offnum;
    9868 UIC           0 :     ItemId      lp = NULL;
    9869                 :     HeapTupleHeader htup;
    9870                 : 
    9871               0 :     xlrec = (xl_heap_lock_updated *) XLogRecGetData(record);
    9872                 : 
    9873                 :     /*
    9874                 :      * The visibility map may need to be fixed even if the heap page is
    9875                 :      * already up-to-date.
    9876                 :      */
    9877               0 :     if (xlrec->flags & XLH_LOCK_ALL_FROZEN_CLEARED)
    9878                 :     {
    9879                 :         RelFileLocator rlocator;
    9880 LBC           0 :         Buffer      vmbuffer = InvalidBuffer;
    9881 ECB             :         BlockNumber block;
    9882                 :         Relation    reln;
    9883                 : 
    9884 UNC           0 :         XLogRecGetBlockTag(record, 0, &rlocator, NULL, &block);
    9885               0 :         reln = CreateFakeRelcacheEntry(rlocator);
    9886                 : 
    9887 LBC           0 :         visibilitymap_pin(reln, block, &vmbuffer);
    9888               0 :         visibilitymap_clear(reln, block, vmbuffer, VISIBILITYMAP_ALL_FROZEN);
    9889                 : 
    9890 UIC           0 :         ReleaseBuffer(vmbuffer);
    9891               0 :         FreeFakeRelcacheEntry(reln);
    9892 ECB             :     }
    9893                 : 
    9894 UIC           0 :     if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    9895 ECB             :     {
    9896 UIC           0 :         page = BufferGetPage(buffer);
    9897 ECB             : 
    9898 UIC           0 :         offnum = xlrec->offnum;
    9899 LBC           0 :         if (PageGetMaxOffsetNumber(page) >= offnum)
    9900               0 :             lp = PageGetItemId(page, offnum);
    9901 ECB             : 
    9902 UIC           0 :         if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
    9903 LBC           0 :             elog(PANIC, "invalid lp");
    9904 EUB             : 
    9905 UIC           0 :         htup = (HeapTupleHeader) PageGetItem(page, lp);
    9906 ECB             : 
    9907 UIC           0 :         htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
    9908               0 :         htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    9909               0 :         fix_infomask_from_infobits(xlrec->infobits_set, &htup->t_infomask,
    9910                 :                                    &htup->t_infomask2);
    9911 LBC           0 :         HeapTupleHeaderSetXmax(htup, xlrec->xmax);
    9912                 : 
    9913               0 :         PageSetLSN(page, lsn);
    9914               0 :         MarkBufferDirty(buffer);
    9915                 :     }
    9916               0 :     if (BufferIsValid(buffer))
    9917               0 :         UnlockReleaseBuffer(buffer);
    9918               0 : }
    9919                 : 
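heap_xlog_confirm, heap_xlog_lock, heap_xlog_lock_updated, and heap_xlog_inplace all repeat the same line-pointer validation before touching the tuple. A hypothetical refactoring sketch; sk_redo_get_tuple_header is not a real heapam.c helper and assumes the includes already present in this file:

    static HeapTupleHeader
    sk_redo_get_tuple_header(Page page, OffsetNumber offnum)
    {
        ItemId      lp = NULL;

        if (PageGetMaxOffsetNumber(page) >= offnum)
            lp = PageGetItemId(page, offnum);

        /* PANIC, as the callers do, when the slot is absent or not a normal item */
        if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
            elog(PANIC, "invalid lp");

        return (HeapTupleHeader) PageGetItem(page, lp);
    }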
    9920                 : static void
    9921 CBC        6111 : heap_xlog_inplace(XLogReaderState *record)
    9922                 : {
    9923            6111 :     XLogRecPtr  lsn = record->EndRecPtr;
    9924            6111 :     xl_heap_inplace *xlrec = (xl_heap_inplace *) XLogRecGetData(record);
    9925                 :     Buffer      buffer;
    9926                 :     Page        page;
    9927                 :     OffsetNumber offnum;
    9928            6111 :     ItemId      lp = NULL;
    9929                 :     HeapTupleHeader htup;
    9930                 :     uint32      oldlen;
    9931                 :     Size        newlen;
    9932                 : 
    9933 GIC        6111 :     if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
    9934                 :     {
    9935 CBC        6072 :         char       *newtup = XLogRecGetBlockData(record, 0, &newlen);
    9936                 : 
    9937 GIC        6072 :         page = BufferGetPage(buffer);
    9938 ECB             : 
    9939 GIC        6072 :         offnum = xlrec->offnum;
    9940            6072 :         if (PageGetMaxOffsetNumber(page) >= offnum)
    9941            6072 :             lp = PageGetItemId(page, offnum);
    9942 ECB             : 
    9943 CBC        6072 :         if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
    9944 UIC           0 :             elog(PANIC, "invalid lp");
    9945 ECB             : 
    9946 CBC        6072 :         htup = (HeapTupleHeader) PageGetItem(page, lp);
    9947                 : 
    9948            6072 :         oldlen = ItemIdGetLength(lp) - htup->t_hoff;
    9949            6072 :         if (oldlen != newlen)
    9950 UIC           0 :             elog(PANIC, "wrong tuple length");
    9951                 : 
    9952 CBC        6072 :         memcpy((char *) htup + htup->t_hoff, newtup, newlen);
    9953                 : 
    9954            6072 :         PageSetLSN(page, lsn);
    9955 GIC        6072 :         MarkBufferDirty(buffer);
    9956 ECB             :     }
    9957 CBC        6111 :     if (BufferIsValid(buffer))
    9958            6111 :         UnlockReleaseBuffer(buffer);
    9959 GIC        6111 : }
    9960 ECB             : 
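An in-place update overwrites the tuple body where it stands, so the replayed data must be exactly as long as the existing body (the item length minus the header offset t_hoff). A minimal sketch of that invariant:

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    sk_inplace_length_ok(uint32_t item_len, uint8_t t_hoff, uint32_t newlen)
    {
        uint32_t oldlen = item_len - t_hoff;   /* tuple data bytes after the header */

        return oldlen == newlen;               /* replay PANICs when this is false */
    }

For example, a 60-byte item with a 24-byte header leaves a 36-byte body, so only a 36-byte replacement can be applied in place.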
    9961 EUB             : void
    9962 GIC     1622042 : heap_redo(XLogReaderState *record)
    9963 ECB             : {
    9964 GIC     1622042 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    9965 ECB             : 
    9966                 :     /*
    9967                 :      * These operations don't overwrite MVCC data so no conflict processing is
     9968                 :      * These operations don't overwrite MVCC data, so no conflict processing
     9969                 :      * is required. The ones in the heap2 rmgr do.
    9970                 : 
    9971 GIC     1622042 :     switch (info & XLOG_HEAP_OPMASK)
    9972                 :     {
    9973         1197422 :         case XLOG_HEAP_INSERT:
    9974 CBC     1197422 :             heap_xlog_insert(record);
    9975 GIC     1197422 :             break;
    9976 CBC      277802 :         case XLOG_HEAP_DELETE:
    9977 GIC      277802 :             heap_xlog_delete(record);
    9978 CBC      277802 :             break;
    9979 GIC       55585 :         case XLOG_HEAP_UPDATE:
    9980           55585 :             heap_xlog_update(record, false);
    9981           55585 :             break;
    9982 LBC           0 :         case XLOG_HEAP_TRUNCATE:
    9983 ECB             : 
    9984                 :             /*
    9985                 :              * TRUNCATE is a no-op because the actions are already logged as
     9986                 :              * SMGR WAL records.  The TRUNCATE WAL record exists only for
     9987                 :              * logical decoding.
    9988                 :              */
    9989 LBC           0 :             break;
    9990 GIC       30912 :         case XLOG_HEAP_HOT_UPDATE:
    9991           30912 :             heap_xlog_update(record, true);
    9992 GBC       30912 :             break;
    9993 GIC          61 :         case XLOG_HEAP_CONFIRM:
    9994 GBC          61 :             heap_xlog_confirm(record);
    9995 GIC          61 :             break;
    9996           54149 :         case XLOG_HEAP_LOCK:
    9997           54149 :             heap_xlog_lock(record);
    9998           54149 :             break;
    9999 GBC        6111 :         case XLOG_HEAP_INPLACE:
   10000 GIC        6111 :             heap_xlog_inplace(record);
   10001            6111 :             break;
   10002 UBC           0 :         default:
   10003 UIC           0 :             elog(PANIC, "heap_redo: unknown op code %u", info);
   10004                 :     }
   10005 GIC     1622042 : }
   10006                 : 
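The dispatch above decodes the record's info byte: the low bits are reserved for generic xlog flags (XLR_INFO_MASK) and are stripped first, then XLOG_HEAP_OPMASK selects the heap operation. A sketch of the decode; the mask values here are restated as assumptions rather than quoted from the headers:

    #include <stdint.h>

    #define SK_XLR_INFO_MASK 0x0Fu   /* assumed: generic per-record flag bits */
    #define SK_HEAP_OPMASK   0x70u   /* assumed: heap operation selector bits */

    static uint8_t
    sk_heap_opcode(uint8_t xl_info)
    {
        return (uint8_t) ((xl_info & ~SK_XLR_INFO_MASK) & SK_HEAP_OPMASK);
    }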
   10007                 : void
   10008 GBC       59400 : heap2_redo(XLogReaderState *record)
   10009                 : {
   10010 GIC       59400 :     uint8       info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
   10011 EUB             : 
   10012 GIC       59400 :     switch (info & XLOG_HEAP_OPMASK)
   10013                 :     {
   10014            6391 :         case XLOG_HEAP2_PRUNE:
   10015 GBC        6391 :             heap_xlog_prune(record);
   10016            6391 :             break;
   10017 GIC        1348 :         case XLOG_HEAP2_VACUUM:
   10018 GBC        1348 :             heap_xlog_vacuum(record);
   10019            1348 :             break;
   10020 GIC          90 :         case XLOG_HEAP2_FREEZE_PAGE:
   10021 GBC          90 :             heap_xlog_freeze_page(record);
   10022              90 :             break;
   10023 GIC        3638 :         case XLOG_HEAP2_VISIBLE:
   10024            3638 :             heap_xlog_visible(record);
   10025 GBC        3638 :             break;
   10026 GIC       46851 :         case XLOG_HEAP2_MULTI_INSERT:
   10027 GBC       46851 :             heap_xlog_multi_insert(record);
   10028 GIC       46851 :             break;
   10029 UBC           0 :         case XLOG_HEAP2_LOCK_UPDATED:
   10030               0 :             heap_xlog_lock_updated(record);
   10031               0 :             break;
   10032 GIC        1082 :         case XLOG_HEAP2_NEW_CID:
   10033 EUB             : 
   10034                 :             /*
   10035                 :              * Nothing to do on a real replay, only used during logical
   10036                 :              * decoding.
   10037                 :              */
   10038 GBC        1082 :             break;
   10039 UBC           0 :         case XLOG_HEAP2_REWRITE:
   10040               0 :             heap_xlog_logical_rewrite(record);
   10041 UIC           0 :             break;
   10042 UBC           0 :         default:
   10043 UIC           0 :             elog(PANIC, "heap2_redo: unknown op code %u", info);
   10044 EUB             :     }
   10045 GBC       59400 : }
   10046                 : 
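heap_redo and heap2_redo are never called directly; recovery reaches them through the resource-manager table, keyed by each WAL record's rmgr id. An abridged sketch of how a redo callback is wired up (the field list is shortened here, and the real entries are generated from access/rmgrlist.h):

    /* Abridged stand-in for RmgrData: only the fields relevant here. */
    typedef struct SkRmgrData
    {
        const char *rm_name;
        void        (*rm_redo) (XLogReaderState *record);
    } SkRmgrData;

    static const SkRmgrData sk_heap_rmgrs[] =
    {
        {"Heap2", heap2_redo},
        {"Heap", heap_redo},
    };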
   10047 EUB             : /*
   10048                 :  * Mask a heap page before performing consistency checks on it.
   10049                 :  */
   10050                 : void
   10051 UIC           0 : heap_mask(char *pagedata, BlockNumber blkno)
   10052 ECB             : {
   10053 UIC           0 :     Page        page = (Page) pagedata;
   10054 ECB             :     OffsetNumber off;
   10055                 : 
   10056 UIC           0 :     mask_page_lsn_and_checksum(page);
   10057                 : 
   10058               0 :     mask_page_hint_bits(page);
   10059 LBC           0 :     mask_unused_space(page);
   10060                 : 
   10061 UIC           0 :     for (off = 1; off <= PageGetMaxOffsetNumber(page); off++)
   10062                 :     {
   10063               0 :         ItemId      iid = PageGetItemId(page, off);
   10064 ECB             :         char       *page_item;
   10065                 : 
   10066 LBC           0 :         page_item = (char *) (page + ItemIdGetOffset(iid));
   10067                 : 
   10068               0 :         if (ItemIdIsNormal(iid))
   10069                 :         {
   10070               0 :             HeapTupleHeader page_htup = (HeapTupleHeader) page_item;
   10071 ECB             : 
   10072                 :             /*
   10073                 :              * If xmin of a tuple is not yet frozen, we should ignore
   10074                 :              * differences in hint bits, since they can be set without
   10075 EUB             :              * emitting WAL.
   10076                 :              */
   10077 LBC           0 :             if (!HeapTupleHeaderXminFrozen(page_htup))
   10078 UIC           0 :                 page_htup->t_infomask &= ~HEAP_XACT_MASK;
   10079 ECB             :             else
   10080                 :             {
   10081 EUB             :                 /* Still we need to mask xmax hint bits. */
   10082 UIC           0 :                 page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
   10083 LBC           0 :                 page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
   10084                 :             }
   10085 ECB             : 
   10086                 :             /*
   10087                 :              * During replay, we set Command Id to FirstCommandId. Hence, mask
   10088                 :              * it. See heap_xlog_insert() for details.
   10089                 :              */
   10090 LBC           0 :             page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
   10091                 : 
   10092                 :             /*
   10093 ECB             :              * For a speculative tuple, heap_insert() does not set ctid in the
   10094                 :              * caller-passed heap tuple itself, leaving the ctid field to
   10095                 :              * contain a speculative token value - a per-backend monotonically
   10096                 :              * increasing identifier. Besides, it does not WAL-log ctid under
   10097                 :              * any circumstances.
   10098                 :              *
   10099                 :              * During redo, heap_xlog_insert() sets t_ctid to current block
   10100                 :              * number and self offset number. It doesn't care about any
   10101                 :              * speculative insertions on the primary. Hence, we set t_ctid to
   10102                 :              * current block number and self offset number to ignore any
   10103                 :              * inconsistency.
   10104                 :              */
   10105 LBC           0 :             if (HeapTupleHeaderIsSpeculative(page_htup))
   10106               0 :                 ItemPointerSet(&page_htup->t_ctid, blkno, off);
   10107 ECB             : 
   10108                 :             /*
   10109                 :              * NB: Not ignoring ctid changes due to the tuple having moved
   10110                 :              * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
   10111                 :              * important information that needs to be in-sync between primary
   10112                 :              * and standby, and thus is WAL logged.
   10113 EUB             :              */
   10114                 :         }
   10115                 : 
   10116                 :         /*
   10117                 :          * Ignore any padding bytes after the tuple, when the length of the
   10118                 :          * item is not MAXALIGNed.
   10119                 :          */
   10120 UBC           0 :         if (ItemIdHasStorage(iid))
   10121 ECB             :         {
   10122 LBC           0 :             int         len = ItemIdGetLength(iid);
   10123               0 :             int         padlen = MAXALIGN(len) - len;
   10124 ECB             : 
   10125 LBC           0 :             if (padlen > 0)
   10126               0 :                 memset(page_item + len, MASK_MARKER, padlen);
   10127 ECB             :         }
   10128                 :     }
   10129 LBC           0 : }
   10130 ECB             : 
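The padding pass at the end of heap_mask depends on MAXALIGN, which rounds a length up to the platform's maximum alignment. A sketch assuming 8-byte alignment; SK_MASK_MARKER is a stand-in for the real MASK_MARKER in bufmask.h:

    #include <stdint.h>
    #include <string.h>

    #define SK_ALIGNOF     8          /* assumed MAXIMUM_ALIGNOF */
    #define SK_MASK_MARKER 0x00       /* stand-in value */
    #define SK_MAXALIGN(len) \
        (((uintptr_t) (len) + (SK_ALIGNOF - 1)) & ~((uintptr_t) (SK_ALIGNOF - 1)))

    static void
    sk_mask_item_padding(char *item, int len)
    {
        int padlen = (int) SK_MAXALIGN(len) - len;  /* e.g. len 29 -> padlen 3 */

        if (padlen > 0)
            memset(item + len, SK_MASK_MARKER, padlen);
    }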
   10131                 : /*
   10132                 :  * HeapCheckForSerializableConflictOut
   10133 EUB             :  *      We are reading a tuple.  If it's not visible, there may be a
   10134                 :  *      rw-conflict out with the inserter.  Otherwise, if it is visible to us
   10135                 :  *      but has been deleted, there may be a rw-conflict out with the deleter.
   10136 ECB             :  *
   10137                 :  * We will determine the top level xid of the writing transaction with which
   10138                 :  * we may be in conflict, and ask CheckForSerializableConflictOut() to check
   10139                 :  * for overlap with our own transaction.
   10140                 :  *
   10141                 :  * This function should be called just about anywhere in heapam.c where a
   10142                 :  * tuple has been read. The caller must hold at least a shared lock on the
   10143                 :  * buffer, because this function might set hint bits on the tuple. There is
   10144                 :  * currently no known reason to call this function from an index AM.
   10145                 :  */
   10146                 : void
   10147 CBC   241099374 : HeapCheckForSerializableConflictOut(bool visible, Relation relation,
   10148 ECB             :                                     HeapTuple tuple, Buffer buffer,
   10149                 :                                     Snapshot snapshot)
   10150                 : {
   10151                 :     TransactionId xid;
   10152                 :     HTSV_Result htsvResult;
   10153                 : 
   10154 CBC   241099374 :     if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
   10155       241074039 :         return;
   10156 ECB             : 
   10157                 :     /*
   10158                 :      * Check to see whether the tuple has been written to by a concurrent
    10159                 :      * transaction, either to create it so it is not visible to us, or to delete it
   10160 EUB             :      * while it is visible to us.  The "visible" bool indicates whether the
   10161                 :      * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
   10162                 :      * is going on with it.
   10163 ECB             :      *
   10164                 :      * In the event of a concurrently inserted tuple that also happens to have
   10165                 :      * been concurrently updated (by a separate transaction), the xmin of the
   10166                 :      * tuple will be used -- not the updater's xid.
   10167                 :      */
   10168 GIC       25335 :     htsvResult = HeapTupleSatisfiesVacuum(tuple, TransactionXmin, buffer);
   10169 CBC       25335 :     switch (htsvResult)
   10170 EUB             :     {
   10171 GBC       24533 :         case HEAPTUPLE_LIVE:
   10172           24533 :             if (visible)
   10173           24520 :                 return;
   10174              13 :             xid = HeapTupleHeaderGetXmin(tuple->t_data);
   10175 GIC          13 :             break;
   10176 CBC         352 :         case HEAPTUPLE_RECENTLY_DEAD:
   10177                 :         case HEAPTUPLE_DELETE_IN_PROGRESS:
   10178 GIC         352 :             if (visible)
   10179             281 :                 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
   10180                 :             else
   10181              71 :                 xid = HeapTupleHeaderGetXmin(tuple->t_data);
   10182 EUB             : 
   10183 GIC         352 :             if (TransactionIdPrecedes(xid, TransactionXmin))
   10184 EUB             :             {
   10185                 :                 /* This is like the HEAPTUPLE_DEAD case */
   10186 GIC          62 :                 Assert(!visible);
   10187 GBC          62 :                 return;
   10188                 :             }
   10189             290 :             break;
   10190             326 :         case HEAPTUPLE_INSERT_IN_PROGRESS:
   10191 GIC         326 :             xid = HeapTupleHeaderGetXmin(tuple->t_data);
   10192 GBC         326 :             break;
   10193 GIC         124 :         case HEAPTUPLE_DEAD:
   10194 GBC         124 :             Assert(!visible);
   10195 GIC         124 :             return;
   10196 UIC           0 :         default:
   10197 EUB             : 
   10198                 :             /*
   10199                 :              * The only way to get to this default clause is if a new value is
   10200                 :              * added to the enum type without adding it to this switch
   10201                 :              * statement.  That's a bug, so elog.
   10202                 :              */
   10203 UIC           0 :             elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
   10204                 : 
   10205                 :             /*
   10206                 :              * In spite of having all enum values covered and calling elog on
   10207                 :              * this default, some compilers think this is a code path which
   10208 EUB             :              * allows xid to be used below without initialization. Silence
   10209                 :              * that warning.
   10210                 :              */
   10211                 :             xid = InvalidTransactionId;
   10212                 :     }
   10213                 : 
   10214 GBC         629 :     Assert(TransactionIdIsValid(xid));
   10215 GIC         629 :     Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
   10216                 : 
   10217                 :     /*
   10218                 :      * Find top level xid.  Bail out if xid is too early to be a conflict, or
   10219                 :      * if it's our own xid.
   10220                 :      */
   10221 GBC         629 :     if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
   10222 GIC          62 :         return;
   10223             567 :     xid = SubTransGetTopmostTransaction(xid);
   10224             567 :     if (TransactionIdPrecedes(xid, TransactionXmin))
   10225 UIC           0 :         return;
   10226                 : 
   10227 GIC         567 :     CheckForSerializableConflictOut(relation, xid, snapshot);
   10228                 : }
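The switch in HeapCheckForSerializableConflictOut condenses to a small decision table; summarizing it here (this restates the code above, nothing new):

    HTSV result                        xid tested for a rw-conflict
    --------------------------------   -----------------------------------------
    HEAPTUPLE_LIVE                     none if visible; otherwise xmin
    HEAPTUPLE_RECENTLY_DEAD /
    HEAPTUPLE_DELETE_IN_PROGRESS       update xid if visible, otherwise xmin;
                                       none if that xid precedes TransactionXmin
    HEAPTUPLE_INSERT_IN_PROGRESS       xmin
    HEAPTUPLE_DEAD                     none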
        

Generated by: LCOV version v1.16-55-g56c0a2a