TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * pruneheap.c
4 : * heap page pruning and HOT-chain management code
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/heap/pruneheap.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/heapam.h"
18 : #include "access/heapam_xlog.h"
19 : #include "access/htup_details.h"
20 : #include "access/transam.h"
21 : #include "access/xlog.h"
22 : #include "access/xloginsert.h"
23 : #include "catalog/catalog.h"
24 : #include "miscadmin.h"
25 : #include "pgstat.h"
26 : #include "storage/bufmgr.h"
27 : #include "utils/snapmgr.h"
28 : #include "utils/rel.h"
29 : #include "utils/snapmgr.h"
30 :
31 : /* Working data for heap_page_prune and subroutines */
32 : typedef struct
33 : {
34 : Relation rel;
35 :
36 : /* tuple visibility test, initialized for the relation */
37 : GlobalVisState *vistest;
38 :
39 : /*
40 : * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have
41 : * been computed (done on demand, and only if
42 : * OldSnapshotThresholdActive()). The first time a tuple is about to be
43 : * removed based on the limited horizon, old_snap_used is set to true, and
44 : * SetOldSnapshotThresholdTimestamp() is called. See
45 : * heap_prune_satisfies_vacuum().
46 : */
47 : TimestampTz old_snap_ts;
48 : TransactionId old_snap_xmin;
49 : bool old_snap_used;
50 :
51 : TransactionId new_prune_xid; /* new prune hint value for page */
52 : TransactionId snapshotConflictHorizon; /* latest xid removed */
53 : int nredirected; /* numbers of entries in arrays below */
54 : int ndead;
55 : int nunused;
56 : /* arrays that accumulate indexes of items to be changed */
57 : OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
58 : OffsetNumber nowdead[MaxHeapTuplesPerPage];
59 : OffsetNumber nowunused[MaxHeapTuplesPerPage];
60 :
61 : /*
62 : * marked[i] is true if item i is entered in one of the above arrays.
63 : *
64 : * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
65 : * 1. Otherwise every access would need to subtract 1.
66 : */
67 : bool marked[MaxHeapTuplesPerPage + 1];
68 :
69 : /*
70 : * Tuple visibility is only computed once for each tuple, for correctness
71 : * and efficiency reasons; see comment in heap_page_prune() for details.
72 : * This is of type int8[], instead of HTSV_Result[], so we can use -1 to
73 : * indicate no visibility has been computed, e.g. for LP_DEAD items.
74 : *
75 : * Same indexing as ->marked.
76 : */
77 : int8 htsv[MaxHeapTuplesPerPage + 1];
78 : } PruneState;
79 :
80 : /* Local functions */
81 : static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
82 : HeapTuple tup,
83 : Buffer buffer);
84 : static int heap_prune_chain(Buffer buffer,
85 : OffsetNumber rootoffnum,
86 : PruneState *prstate);
87 : static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
88 : static void heap_prune_record_redirect(PruneState *prstate,
89 : OffsetNumber offnum, OffsetNumber rdoffnum);
90 : static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
91 : static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
92 : static void page_verify_redirects(Page page);
93 :
94 :
95 : /*
96 : * Optionally prune and repair fragmentation in the specified page.
97 : *
98 : * This is an opportunistic function. It will perform housekeeping
99 : * only if the page heuristically looks like a candidate for pruning and we
100 : * can acquire buffer cleanup lock without blocking.
101 : *
102 : * Note: this is called quite often. It's important that it fall out quickly
103 : * if there's not any use in pruning.
104 : *
105 : * Caller must have pin on the buffer, and must *not* have a lock on it.
106 : */
107 : void
108 CBC 17752776 : heap_page_prune_opt(Relation relation, Buffer buffer)
109 : {
110 17752776 : Page page = BufferGetPage(buffer);
111 : TransactionId prune_xid;
112 : GlobalVisState *vistest;
113 17752776 : TransactionId limited_xmin = InvalidTransactionId;
114 17752776 : TimestampTz limited_ts = 0;
115 : Size minfree;
116 :
117 : /*
118 : * We can't write WAL in recovery mode, so there's no point trying to
119 : * clean the page. The primary will likely issue a cleaning WAL record
120 : * soon anyway, so this is no particular loss.
121 : */
122 17752776 : if (RecoveryInProgress())
123 16621937 : return;
124 :
125 : /*
126 : * XXX: Magic to keep old_snapshot_threshold tests appear "working". They
127 : * currently are broken, and discussion of what to do about them is
128 : * ongoing. See
129 : * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de
130 : */
131 17575599 : if (old_snapshot_threshold == 0)
132 2627 : SnapshotTooOldMagicForTest();
133 :
134 : /*
135 : * First check whether there's any chance there's something to prune,
136 : * determining the appropriate horizon is a waste if there's no prune_xid
137 : * (i.e. no updates/deletes left potentially dead tuples around).
138 : */
139 17575599 : prune_xid = ((PageHeader) page)->pd_prune_xid;
140 17575599 : if (!TransactionIdIsValid(prune_xid))
141 10033748 : return;
142 :
143 : /*
144 : * Check whether prune_xid indicates that there may be dead rows that can
145 : * be cleaned up.
146 : *
147 : * It is OK to check the old snapshot limit before acquiring the cleanup
148 : * lock because the worst that can happen is that we are not quite as
149 : * aggressive about the cleanup (by however many transaction IDs are
150 : * consumed between this point and acquiring the lock). This allows us to
151 : * save significant overhead in the case where the page is found not to be
152 : * prunable.
153 : *
154 : * Even if old_snapshot_threshold is set, we first check whether the page
155 : * can be pruned without. Both because
156 : * TransactionIdLimitedForOldSnapshots() is not cheap, and because not
157 : * unnecessarily relying on old_snapshot_threshold avoids causing
158 : * conflicts.
159 : */
160 7541851 : vistest = GlobalVisTestFor(relation);
161 :
162 7541851 : if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
163 : {
164 6410719 : if (!OldSnapshotThresholdActive())
165 6410698 : return;
166 :
167 21 : if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest),
168 : relation,
169 : &limited_xmin, &limited_ts))
170 19 : return;
171 :
172 2 : if (!TransactionIdPrecedes(prune_xid, limited_xmin))
173 2 : return;
174 : }
175 :
176 : /*
177 : * We prune when a previous UPDATE failed to find enough space on the page
178 : * for a new tuple version, or when free space falls below the relation's
179 : * fill-factor target (but not less than 10%).
180 : *
181 : * Checking free space here is questionable since we aren't holding any
182 : * lock on the buffer; in the worst case we could get a bogus answer. It's
183 : * unlikely to be *seriously* wrong, though, since reading either pd_lower
184 : * or pd_upper is probably atomic. Avoiding taking a lock seems more
185 : * important than sometimes getting a wrong answer in what is after all
186 : * just a heuristic estimate.
187 : */
188 1131132 : minfree = RelationGetTargetPageFreeSpace(relation,
189 : HEAP_DEFAULT_FILLFACTOR);
190 1131132 : minfree = Max(minfree, BLCKSZ / 10);
191 :
192 1131132 : if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
193 : {
194 : /* OK, try to get exclusive buffer lock */
195 104122 : if (!ConditionalLockBufferForCleanup(buffer))
196 293 : return;
197 :
198 : /*
199 : * Now that we have buffer lock, get accurate information about the
200 : * page's free space, and recheck the heuristic about whether to
201 : * prune. (We needn't recheck PageIsPrunable, since no one else could
202 : * have pruned while we hold pin.)
203 : */
204 103829 : if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
205 : {
206 : int ndeleted,
207 : nnewlpdead;
208 :
209 103829 : ndeleted = heap_page_prune(relation, buffer, vistest, limited_xmin,
210 : limited_ts, &nnewlpdead, NULL);
211 :
212 : /*
213 : * Report the number of tuples reclaimed to pgstats. This is
214 : * ndeleted minus the number of newly-LP_DEAD-set items.
215 : *
216 : * We derive the number of dead tuples like this to avoid totally
217 : * forgetting about items that were set to LP_DEAD, since they
218 : * still need to be cleaned up by VACUUM. We only want to count
219 : * heap-only tuples that just became LP_UNUSED in our report,
220 : * which don't.
221 : *
222 : * VACUUM doesn't have to compensate in the same way when it
223 : * tracks ndeleted, since it will set the same LP_DEAD items to
224 : * LP_UNUSED separately.
225 : */
226 103829 : if (ndeleted > nnewlpdead)
227 54279 : pgstat_update_heap_dead_tuples(relation,
228 : ndeleted - nnewlpdead);
229 : }
230 :
231 : /* And release buffer lock */
232 103829 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
233 :
234 : /*
235 : * We avoid reuse of any free space created on the page by unrelated
236 : * UPDATEs/INSERTs by opting to not update the FSM at this point. The
237 : * free space should be reused by UPDATEs to *this* page.
238 : */
239 : }
240 : }
241 :
242 :
243 : /*
244 : * Prune and repair fragmentation in the specified page.
245 : *
246 : * Caller must have pin and buffer cleanup lock on the page. Note that we
247 : * don't update the FSM information for page on caller's behalf. Caller might
248 : * also need to account for a reduction in the length of the line pointer
249 : * array following array truncation by us.
250 : *
251 : * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD
252 : * (see heap_prune_satisfies_vacuum and
253 : * HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to
254 : * either have been set by TransactionIdLimitedForOldSnapshots, or
255 : * InvalidTransactionId/0 respectively.
256 : *
257 : * Sets *nnewlpdead for caller, indicating the number of items that were
258 : * newly set LP_DEAD during prune operation.
259 : *
260 : * off_loc is the offset location required by the caller to use in error
261 : * callback.
262 : *
263 : * Returns the number of tuples deleted from the page during this call.
264 : */
265 : int
266 268918 : heap_page_prune(Relation relation, Buffer buffer,
267 : GlobalVisState *vistest,
268 : TransactionId old_snap_xmin,
269 : TimestampTz old_snap_ts,
270 : int *nnewlpdead,
271 : OffsetNumber *off_loc)
272 : {
273 268918 : int ndeleted = 0;
274 268918 : Page page = BufferGetPage(buffer);
275 268918 : BlockNumber blockno = BufferGetBlockNumber(buffer);
276 : OffsetNumber offnum,
277 : maxoff;
278 : PruneState prstate;
279 : HeapTupleData tup;
280 :
281 : /*
282 : * Our strategy is to scan the page and make lists of items to change,
283 : * then apply the changes within a critical section. This keeps as much
284 : * logic as possible out of the critical section, and also ensures that
285 : * WAL replay will work the same as the normal case.
286 : *
287 : * First, initialize the new pd_prune_xid value to zero (indicating no
288 : * prunable tuples). If we find any tuples which may soon become
289 : * prunable, we will save the lowest relevant XID in new_prune_xid. Also
290 : * initialize the rest of our working state.
291 : */
292 268918 : prstate.new_prune_xid = InvalidTransactionId;
293 268918 : prstate.rel = relation;
294 268918 : prstate.vistest = vistest;
295 268918 : prstate.old_snap_xmin = old_snap_xmin;
296 268918 : prstate.old_snap_ts = old_snap_ts;
297 268918 : prstate.old_snap_used = false;
298 GNC 268918 : prstate.snapshotConflictHorizon = InvalidTransactionId;
299 CBC 268918 : prstate.nredirected = prstate.ndead = prstate.nunused = 0;
300 268918 : memset(prstate.marked, 0, sizeof(prstate.marked));
301 :
302 268918 : maxoff = PageGetMaxOffsetNumber(page);
303 268918 : tup.t_tableOid = RelationGetRelid(prstate.rel);
304 :
305 : /*
306 : * Determine HTSV for all tuples.
307 : *
308 : * This is required for correctness to deal with cases where running HTSV
309 : * twice could result in different results (e.g. RECENTLY_DEAD can turn to
310 : * DEAD if another checked item causes GlobalVisTestIsRemovableFullXid()
311 : * to update the horizon, INSERT_IN_PROGRESS can change to DEAD if the
312 : * inserting transaction aborts, ...). That in turn could cause
313 : * heap_prune_chain() to behave incorrectly if a tuple is reached twice,
314 : * once directly via a heap_prune_chain() and once following a HOT chain.
315 : *
316 : * It's also good for performance. Most commonly tuples within a page are
317 : * stored at decreasing offsets (while the items are stored at increasing
318 : * offsets). When processing all tuples on a page this leads to reading
319 : * memory at decreasing offsets within a page, with a variable stride.
320 : * That's hard for CPU prefetchers to deal with. Processing the items in
321 : * reverse order (and thus the tuples in increasing order) increases
322 : * prefetching efficiency significantly / decreases the number of cache
323 : * misses.
324 : */
325 268918 : for (offnum = maxoff;
326 16690039 : offnum >= FirstOffsetNumber;
327 16421121 : offnum = OffsetNumberPrev(offnum))
328 : {
329 16421121 : ItemId itemid = PageGetItemId(page, offnum);
330 : HeapTupleHeader htup;
331 :
332 : /* Nothing to do if slot doesn't contain a tuple */
333 16421121 : if (!ItemIdIsNormal(itemid))
334 : {
335 1683379 : prstate.htsv[offnum] = -1;
336 1683379 : continue;
337 : }
338 :
339 14737742 : htup = (HeapTupleHeader) PageGetItem(page, itemid);
340 14737742 : tup.t_data = htup;
341 14737742 : tup.t_len = ItemIdGetLength(itemid);
342 14737742 : ItemPointerSet(&(tup.t_self), blockno, offnum);
343 :
344 : /*
345 : * Set the offset number so that we can display it along with any
346 : * error that occurred while processing this tuple.
347 : */
348 14737742 : if (off_loc)
349 9682613 : *off_loc = offnum;
350 :
351 14737742 : prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup,
352 : buffer);
353 : }
354 :
355 : /* Scan the page */
356 268918 : for (offnum = FirstOffsetNumber;
357 16690039 : offnum <= maxoff;
358 16421121 : offnum = OffsetNumberNext(offnum))
359 : {
360 : ItemId itemid;
361 :
362 : /* Ignore items already processed as part of an earlier chain */
363 16421121 : if (prstate.marked[offnum])
364 193811 : continue;
365 :
366 : /* see preceding loop */
367 16227310 : if (off_loc)
368 10145699 : *off_loc = offnum;
369 :
370 : /* Nothing to do if slot is empty or already dead */
371 16227310 : itemid = PageGetItemId(page, offnum);
372 16227310 : if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
373 1098407 : continue;
374 :
375 : /* Process this item or chain of items */
376 15128903 : ndeleted += heap_prune_chain(buffer, offnum, &prstate);
377 : }
378 :
379 : /* Clear the offset information once we have processed the given page. */
380 268918 : if (off_loc)
381 165089 : *off_loc = InvalidOffsetNumber;
382 :
383 : /* Any error while applying the changes is critical */
384 268918 : START_CRIT_SECTION();
385 :
386 : /* Have we found any prunable items? */
387 268918 : if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
388 : {
389 : /*
390 : * Apply the planned item changes, then repair page fragmentation, and
391 : * update the page's hint bit about whether it has free line pointers.
392 : */
393 112841 : heap_page_prune_execute(buffer,
394 : prstate.redirected, prstate.nredirected,
395 : prstate.nowdead, prstate.ndead,
396 : prstate.nowunused, prstate.nunused);
397 :
398 : /*
399 : * Update the page's pd_prune_xid field to either zero, or the lowest
400 : * XID of any soon-prunable tuple.
401 : */
402 112841 : ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
403 :
404 : /*
405 : * Also clear the "page is full" flag, since there's no point in
406 : * repeating the prune/defrag process until something else happens to
407 : * the page.
408 : */
409 112841 : PageClearFull(page);
410 :
411 112841 : MarkBufferDirty(buffer);
412 :
413 : /*
414 : * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did
415 : */
416 112841 : if (RelationNeedsWAL(relation))
417 : {
418 : xl_heap_prune xlrec;
419 : XLogRecPtr recptr;
420 :
421 GNC 112027 : xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(relation);
422 112027 : xlrec.snapshotConflictHorizon = prstate.snapshotConflictHorizon;
423 CBC 112027 : xlrec.nredirected = prstate.nredirected;
424 112027 : xlrec.ndead = prstate.ndead;
425 ECB :
426 GIC 112027 : XLogBeginInsert();
427 CBC 112027 : XLogRegisterData((char *) &xlrec, SizeOfHeapPrune);
428 ECB :
429 GIC 112027 : XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
430 ECB :
431 : /*
432 : * The OffsetNumber arrays are not actually in the buffer, but we
433 : * pretend that they are. When XLogInsert stores the whole
434 : * buffer, the offset arrays need not be stored too.
435 : */
436 GIC 112027 : if (prstate.nredirected > 0)
437 CBC 53622 : XLogRegisterBufData(0, (char *) prstate.redirected,
438 53622 : prstate.nredirected *
439 ECB : sizeof(OffsetNumber) * 2);
440 :
441 GIC 112027 : if (prstate.ndead > 0)
442 CBC 62485 : XLogRegisterBufData(0, (char *) prstate.nowdead,
443 62485 : prstate.ndead * sizeof(OffsetNumber));
444 ECB :
445 GIC 112027 : if (prstate.nunused > 0)
446 CBC 17957 : XLogRegisterBufData(0, (char *) prstate.nowunused,
447 17957 : prstate.nunused * sizeof(OffsetNumber));
448 ECB :
449 GIC 112027 : recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE);
450 ECB :
451 GIC 112027 : PageSetLSN(BufferGetPage(buffer), recptr);
452 ECB : }
453 : }
454 : else
455 : {
456 : /*
457 : * If we didn't prune anything, but have found a new value for the
458 : * pd_prune_xid field, update it and mark the buffer dirty. This is
459 : * treated as a non-WAL-logged hint.
460 : *
461 : * Also clear the "page is full" flag if it is set, since there's no
462 : * point in repeating the prune/defrag process until something else
463 : * happens to the page.
464 : */
465 GIC 312060 : if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
466 CBC 155983 : PageIsFull(page))
467 ECB : {
468 GIC 192 : ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
469 CBC 192 : PageClearFull(page);
470 192 : MarkBufferDirtyHint(buffer, true);
471 ECB : }
472 : }
473 :
474 GIC 268918 : END_CRIT_SECTION();
475 ECB :
476 : /* Record number of newly-set-LP_DEAD items for caller */
477 GIC 268918 : *nnewlpdead = prstate.ndead;
478 ECB :
479 GIC 268918 : return ndeleted;
480 ECB : }
481 :
482 :
483 : /*
484 : * Perform visibility checks for heap pruning.
485 : *
486 : * This is more complicated than just using GlobalVisTestIsRemovableXid()
487 : * because of old_snapshot_threshold. We only want to increase the threshold
488 : * that triggers errors for old snapshots when we actually decide to remove a
489 : * row based on the limited horizon.
490 : *
491 : * Due to its cost we also only want to call
492 : * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have
493 : * done so in heap_hot_prune_opt() if pd_prune_xid was old enough. But we
494 : * still want to be able to remove rows that are too new to be removed
495 : * according to prstate->vistest, but that can be removed based on
496 : * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on
497 : * demand in here, if appropriate.
498 : */
499 : static HTSV_Result
500 GIC 14737742 : heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
501 ECB : {
502 : HTSV_Result res;
503 : TransactionId dead_after;
504 :
505 GIC 14737742 : res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);
506 ECB :
507 GIC 14737742 : if (res != HEAPTUPLE_RECENTLY_DEAD)
508 CBC 12938086 : return res;
509 ECB :
510 : /*
511 : * If we are already relying on the limited xmin, there is no need to
512 : * delay doing so anymore.
513 : */
514 GIC 1799656 : if (prstate->old_snap_used)
515 ECB : {
516 UIC 0 : Assert(TransactionIdIsValid(prstate->old_snap_xmin));
517 EUB :
518 UIC 0 : if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
519 UBC 0 : res = HEAPTUPLE_DEAD;
520 0 : return res;
521 EUB : }
522 :
523 : /*
524 : * First check if GlobalVisTestIsRemovableXid() is sufficient to find the
525 : * row dead. If not, and old_snapshot_threshold is enabled, try to use the
526 : * lowered horizon.
527 : */
528 GIC 1799656 : if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
529 CBC 1459724 : res = HEAPTUPLE_DEAD;
530 339932 : else if (OldSnapshotThresholdActive())
531 ECB : {
532 : /* haven't determined limited horizon yet, requests */
533 UIC 0 : if (!TransactionIdIsValid(prstate->old_snap_xmin))
534 EUB : {
535 : TransactionId horizon =
536 UIC 0 : GlobalVisTestNonRemovableHorizon(prstate->vistest);
537 EUB :
538 UIC 0 : TransactionIdLimitedForOldSnapshots(horizon, prstate->rel,
539 EUB : &prstate->old_snap_xmin,
540 : &prstate->old_snap_ts);
541 : }
542 :
543 UIC 0 : if (TransactionIdIsValid(prstate->old_snap_xmin) &&
544 UBC 0 : TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
545 EUB : {
546 : /*
547 : * About to remove row based on snapshot_too_old. Need to raise
548 : * the threshold so problematic accesses would error.
549 : */
550 UIC 0 : Assert(!prstate->old_snap_used);
551 UBC 0 : SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts,
552 EUB : prstate->old_snap_xmin);
553 UIC 0 : prstate->old_snap_used = true;
554 UBC 0 : res = HEAPTUPLE_DEAD;
555 EUB : }
556 : }
557 :
558 GIC 1799656 : return res;
559 ECB : }
560 :
561 :
562 : /*
563 : * Prune specified line pointer or a HOT chain originating at line pointer.
564 : *
565 : * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
566 : * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
567 : * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
568 : * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
569 : * DEAD, our visibility test is just too coarse to detect it.
570 : *
571 : * In general, pruning must never leave behind a DEAD tuple that still has
572 : * tuple storage. VACUUM isn't prepared to deal with that case. That's why
573 : * VACUUM prunes the same heap page a second time (without dropping its lock
574 : * in the interim) when it sees a newly DEAD tuple that we initially saw as
575 : * in-progress. Retrying pruning like this can only happen when an inserting
576 : * transaction concurrently aborts.
577 : *
578 : * The root line pointer is redirected to the tuple immediately after the
579 : * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
580 : * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
581 : * tuple, which we treat as a chain of length 1.)
582 : *
583 : * We don't actually change the page here. We just add entries to the arrays in
584 : * prstate showing the changes to be made. Items to be redirected are added
585 : * to the redirected[] array (two entries per redirection); items to be set to
586 : * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
587 : * state are added to nowunused[].
588 : *
589 : * Returns the number of tuples (to be) deleted from the page.
590 : */
591 : static int
592 GIC 15128903 : heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
593 ECB : {
594 GIC 15128903 : int ndeleted = 0;
595 CBC 15128903 : Page dp = (Page) BufferGetPage(buffer);
596 15128903 : TransactionId priorXmax = InvalidTransactionId;
597 ECB : ItemId rootlp;
598 : HeapTupleHeader htup;
599 GIC 15128903 : OffsetNumber latestdead = InvalidOffsetNumber,
600 CBC 15128903 : maxoff = PageGetMaxOffsetNumber(dp),
601 ECB : offnum;
602 : OffsetNumber chainitems[MaxHeapTuplesPerPage];
603 GIC 15128903 : int nchain = 0,
604 ECB : i;
605 :
606 GIC 15128903 : rootlp = PageGetItemId(dp, rootoffnum);
607 ECB :
608 : /*
609 : * If it's a heap-only tuple, then it is not the start of a HOT chain.
610 : */
611 GIC 15128903 : if (ItemIdIsNormal(rootlp))
612 ECB : {
613 GIC 14543931 : Assert(prstate->htsv[rootoffnum] != -1);
614 CBC 14543931 : htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
615 ECB :
616 GIC 14543931 : if (HeapTupleHeaderIsHeapOnly(htup))
617 ECB : {
618 : /*
619 : * If the tuple is DEAD and doesn't chain to anything else, mark
620 : * it unused immediately. (If it does chain, we can only remove
621 : * it as part of pruning its chain.)
622 : *
623 : * We need this primarily to handle aborted HOT updates, that is,
624 : * XMIN_INVALID heap-only tuples. Those might not be linked to by
625 : * any chain, since the parent tuple might be re-updated before
626 : * any pruning occurs. So we have to be able to reap them
627 : * separately from chain-pruning. (Note that
628 : * HeapTupleHeaderIsHotUpdated will never return true for an
629 : * XMIN_INVALID tuple, so this code will work even when there were
630 : * sequential updates within the aborted transaction.)
631 : *
632 : * Note that we might first arrive at a dead heap-only tuple
633 : * either here or while following a chain below. Whichever path
634 : * gets there first will mark the tuple unused.
635 : */
636 GIC 579696 : if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD &&
637 CBC 7238 : !HeapTupleHeaderIsHotUpdated(htup))
638 ECB : {
639 GIC 3132 : heap_prune_record_unused(prstate, rootoffnum);
640 GNC 3132 : HeapTupleHeaderAdvanceConflictHorizon(htup,
641 : &prstate->snapshotConflictHorizon);
642 GIC 3132 : ndeleted++;
643 ECB : }
644 :
645 : /* Nothing more to do */
646 GIC 579696 : return ndeleted;
647 ECB : }
648 : }
649 :
650 : /* Start from the root tuple */
651 GIC 14549207 : offnum = rootoffnum;
652 ECB :
653 : /* while not end of the chain */
654 : for (;;)
655 GIC 761000 : {
656 ECB : ItemId lp;
657 : bool tupdead,
658 : recent_dead;
659 :
660 : /* Sanity check (pure paranoia) */
661 GIC 15310207 : if (offnum < FirstOffsetNumber)
662 LBC 0 : break;
663 EUB :
664 : /*
665 : * An offset past the end of page's line pointer array is possible
666 : * when the array was truncated (original item must have been unused)
667 : */
668 GIC 15310207 : if (offnum > maxoff)
669 LBC 0 : break;
670 EUB :
671 : /* If item is already processed, stop --- it must not be same chain */
672 GIC 15310207 : if (prstate->marked[offnum])
673 CBC 1907 : break;
674 ECB :
675 GIC 15308300 : lp = PageGetItemId(dp, offnum);
676 ECB :
677 : /* Unused item obviously isn't part of the chain */
678 GIC 15308300 : if (!ItemIdIsUsed(lp))
679 LBC 0 : break;
680 EUB :
681 : /*
682 : * If we are looking at the redirected root line pointer, jump to the
683 : * first normal tuple in the chain. If we find a redirect somewhere
684 : * else, stop --- it must not be same chain.
685 : */
686 GIC 15308300 : if (ItemIdIsRedirected(lp))
687 ECB : {
688 GIC 584972 : if (nchain > 0)
689 LBC 0 : break; /* not at start of chain */
690 GBC 584972 : chainitems[nchain++] = offnum;
691 CBC 584972 : offnum = ItemIdGetRedirect(rootlp);
692 584972 : continue;
693 ECB : }
694 :
695 : /*
696 : * Likewise, a dead line pointer can't be part of the chain. (We
697 : * already eliminated the case of dead root tuple outside this
698 : * function.)
699 : */
700 GIC 14723328 : if (ItemIdIsDead(lp))
701 LBC 0 : break;
702 EUB :
703 GIC 14723328 : Assert(ItemIdIsNormal(lp));
704 CBC 14723328 : Assert(prstate->htsv[offnum] != -1);
705 14723328 : htup = (HeapTupleHeader) PageGetItem(dp, lp);
706 ECB :
707 : /*
708 : * Check the tuple XMIN against prior XMAX, if any
709 : */
710 GIC 14898367 : if (TransactionIdIsValid(priorXmax) &&
711 CBC 175039 : !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
712 LBC 0 : break;
713 EUB :
714 : /*
715 : * OK, this tuple is indeed a member of the chain.
716 : */
717 GIC 14723328 : chainitems[nchain++] = offnum;
718 ECB :
719 : /*
720 : * Check tuple's visibility status.
721 : */
722 GIC 14723328 : tupdead = recent_dead = false;
723 ECB :
724 GIC 14723328 : switch ((HTSV_Result) prstate->htsv[offnum])
725 ECB : {
726 GIC 1476657 : case HEAPTUPLE_DEAD:
727 CBC 1476657 : tupdead = true;
728 1476657 : break;
729 ECB :
730 GIC 339932 : case HEAPTUPLE_RECENTLY_DEAD:
731 CBC 339932 : recent_dead = true;
732 ECB :
733 : /*
734 : * This tuple may soon become DEAD. Update the hint field so
735 : * that the page is reconsidered for pruning in future.
736 : */
737 GIC 339932 : heap_prune_record_prunable(prstate,
738 CBC 339932 : HeapTupleHeaderGetUpdateXid(htup));
739 339932 : break;
740 ECB :
741 GIC 14271 : case HEAPTUPLE_DELETE_IN_PROGRESS:
742 ECB :
743 : /*
744 : * This tuple may soon become DEAD. Update the hint field so
745 : * that the page is reconsidered for pruning in future.
746 : */
747 GIC 14271 : heap_prune_record_prunable(prstate,
748 CBC 14271 : HeapTupleHeaderGetUpdateXid(htup));
749 14271 : break;
750 ECB :
751 GIC 12892468 : case HEAPTUPLE_LIVE:
752 ECB : case HEAPTUPLE_INSERT_IN_PROGRESS:
753 :
754 : /*
755 : * If we wanted to optimize for aborts, we might consider
756 : * marking the page prunable when we see INSERT_IN_PROGRESS.
757 : * But we don't. See related decisions about when to mark the
758 : * page prunable in heapam.c.
759 : */
760 GIC 12892468 : break;
761 ECB :
762 UIC 0 : default:
763 UBC 0 : elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
764 EUB : break;
765 : }
766 :
767 : /*
768 : * Remember the last DEAD tuple seen. We will advance past
769 : * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
770 : * but we can't advance past anything else. We have to make sure that
771 : * we don't miss any DEAD tuples, since DEAD tuples that still have
772 : * tuple storage after pruning will confuse VACUUM.
773 : */
774 GIC 14723328 : if (tupdead)
775 ECB : {
776 GIC 1476657 : latestdead = offnum;
777 GNC 1476657 : HeapTupleHeaderAdvanceConflictHorizon(htup,
778 : &prstate->snapshotConflictHorizon);
779 : }
780 GIC 13246671 : else if (!recent_dead)
781 CBC 12906739 : break;
782 ECB :
783 : /*
784 : * If the tuple is not HOT-updated, then we are at the end of this
785 : * HOT-update chain.
786 : */
787 GIC 1816589 : if (!HeapTupleHeaderIsHotUpdated(htup))
788 ECB : break;
789 :
790 : /* HOT implies it can't have moved to different partition */
791 GIC 176028 : Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
792 ECB :
793 : /*
794 : * Advance to next chain member.
795 : */
796 GIC 176028 : Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
797 ECB : BufferGetBlockNumber(buffer));
798 GIC 176028 : offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
799 CBC 176028 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
800 ECB : }
801 :
802 : /*
803 : * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
804 : * the DEAD tuples at the start of the chain are removed and the root line
805 : * pointer is appropriately redirected.
806 : */
807 GIC 14549207 : if (OffsetNumberIsValid(latestdead))
808 ECB : {
809 : /*
810 : * Mark as unused each intermediate item that we are able to remove
811 : * from the chain.
812 : *
813 : * When the previous item is the last dead tuple seen, we are at the
814 : * right candidate for redirection.
815 : */
816 GIC 1509961 : for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
817 ECB : {
818 GIC 74717 : heap_prune_record_unused(prstate, chainitems[i]);
819 CBC 74717 : ndeleted++;
820 ECB : }
821 :
822 : /*
823 : * If the root entry had been a normal tuple, we are deleting it, so
824 : * count it in the result. But changing a redirect (even to DEAD
825 : * state) doesn't count.
826 : */
827 GIC 1435244 : if (ItemIdIsNormal(rootlp))
828 CBC 1401940 : ndeleted++;
829 ECB :
830 : /*
831 : * If the DEAD tuple is at the end of the chain, the entire chain is
832 : * dead and the root line pointer can be marked dead. Otherwise just
833 : * redirect the root to the correct chain member.
834 : */
835 GIC 1435244 : if (i >= nchain)
836 CBC 1305612 : heap_prune_record_dead(prstate, rootoffnum);
837 ECB : else
838 GIC 129632 : heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
839 ECB : }
840 GIC 13113963 : else if (nchain < 2 && ItemIdIsRedirected(rootlp))
841 ECB : {
842 : /*
843 : * We found a redirect item that doesn't point to a valid follow-on
844 : * item. This can happen if the loop in heap_page_prune caused us to
845 : * visit the dead successor of a redirect item before visiting the
846 : * redirect item. We can clean up by setting the redirect item to
847 : * DEAD state.
848 : */
849 GIC 918 : heap_prune_record_dead(prstate, rootoffnum);
850 ECB : }
851 :
852 GIC 14549207 : return ndeleted;
853 ECB : }
854 :
855 : /* Record lowest soon-prunable XID */
856 : static void
857 GIC 354203 : heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
858 ECB : {
859 : /*
860 : * This should exactly match the PageSetPrunable macro. We can't store
861 : * directly into the page header yet, so we update working state.
862 : */
863 GIC 354203 : Assert(TransactionIdIsNormal(xid));
864 CBC 698363 : if (!TransactionIdIsValid(prstate->new_prune_xid) ||
865 344160 : TransactionIdPrecedes(xid, prstate->new_prune_xid))
866 11185 : prstate->new_prune_xid = xid;
867 354203 : }
868 ECB :
869 : /* Record line pointer to be redirected */
870 : static void
871 GIC 129632 : heap_prune_record_redirect(PruneState *prstate,
872 ECB : OffsetNumber offnum, OffsetNumber rdoffnum)
873 : {
874 GIC 129632 : Assert(prstate->nredirected < MaxHeapTuplesPerPage);
875 CBC 129632 : prstate->redirected[prstate->nredirected * 2] = offnum;
876 129632 : prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
877 129632 : prstate->nredirected++;
878 129632 : Assert(!prstate->marked[offnum]);
879 129632 : prstate->marked[offnum] = true;
880 129632 : Assert(!prstate->marked[rdoffnum]);
881 129632 : prstate->marked[rdoffnum] = true;
882 129632 : }
883 ECB :
884 : /* Record line pointer to be marked dead */
885 : static void
886 GIC 1306530 : heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
887 ECB : {
888 GIC 1306530 : Assert(prstate->ndead < MaxHeapTuplesPerPage);
889 CBC 1306530 : prstate->nowdead[prstate->ndead] = offnum;
890 1306530 : prstate->ndead++;
891 1306530 : Assert(!prstate->marked[offnum]);
892 1306530 : prstate->marked[offnum] = true;
893 1306530 : }
894 ECB :
895 : /* Record line pointer to be marked unused */
896 : static void
897 GIC 77849 : heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
898 ECB : {
899 GIC 77849 : Assert(prstate->nunused < MaxHeapTuplesPerPage);
900 CBC 77849 : prstate->nowunused[prstate->nunused] = offnum;
901 77849 : prstate->nunused++;
902 77849 : Assert(!prstate->marked[offnum]);
903 77849 : prstate->marked[offnum] = true;
904 77849 : }
905 ECB :
906 :
/*
 * Perform the actual page changes needed by heap_page_prune.
 * It is expected that the caller has a full cleanup lock on the
 * buffer.
 *
 * The three arrays describe the line pointer changes to apply:
 * - redirected: 2 * nredirected OffsetNumbers, laid out as (from, to)
 *   pairs; each 'from' item is turned into an LP_REDIRECT pointing at 'to'.
 * - nowdead: ndead offsets whose items are set LP_DEAD.
 * - nowunused: nunused offsets whose items are set LP_UNUSED.
 * Afterward the page is defragmented, so all item pointers into the page
 * held by the caller become invalid.
 */
void
heap_page_prune_execute(Buffer buffer,
						OffsetNumber *redirected, int nredirected,
						OffsetNumber *nowdead, int ndead,
						OffsetNumber *nowunused, int nunused)
{
	Page		page = (Page) BufferGetPage(buffer);
	OffsetNumber *offnum;
	HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;

	/* Shouldn't be called unless there's something to do */
	Assert(nredirected > 0 || ndead > 0 || nunused > 0);

	/* Update all redirected line pointers */
	offnum = redirected;
	for (int i = 0; i < nredirected; i++)
	{
		/* Consume one (from, to) pair from the redirected array */
		OffsetNumber fromoff = *offnum++;
		OffsetNumber tooff = *offnum++;
		ItemId		fromlp = PageGetItemId(page, fromoff);
		ItemId		tolp PG_USED_FOR_ASSERTS_ONLY;

#ifdef USE_ASSERT_CHECKING

		/*
		 * Any existing item that we set as an LP_REDIRECT (any 'from' item)
		 * must be the first item from a HOT chain.  If the item has tuple
		 * storage then it can't be a heap-only tuple.  Otherwise we are just
		 * maintaining an existing LP_REDIRECT from an existing HOT chain that
		 * has been pruned at least once before now.
		 */
		if (!ItemIdIsRedirected(fromlp))
		{
			Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));

			htup = (HeapTupleHeader) PageGetItem(page, fromlp);
			Assert(!HeapTupleHeaderIsHeapOnly(htup));
		}
		else
		{
			/* We shouldn't need to redundantly set the redirect */
			Assert(ItemIdGetRedirect(fromlp) != tooff);
		}

		/*
		 * The item that we're about to set as an LP_REDIRECT (the 'from'
		 * item) will point to an existing item (the 'to' item) that is
		 * already a heap-only tuple.  There can be at most one LP_REDIRECT
		 * item per HOT chain.
		 *
		 * We need to keep around an LP_REDIRECT item (after original
		 * non-heap-only root tuple gets pruned away) so that it's always
		 * possible for VACUUM to easily figure out what TID to delete from
		 * indexes when an entire HOT chain becomes dead.  A heap-only tuple
		 * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
		 * tuple can.
		 *
		 * This check may miss problems, e.g. the target of a redirect could
		 * be marked as unused subsequently. The page_verify_redirects() check
		 * below will catch such problems.
		 */
		tolp = PageGetItemId(page, tooff);
		Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
		htup = (HeapTupleHeader) PageGetItem(page, tolp);
		Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

		ItemIdSetRedirect(fromlp, tooff);
	}

	/* Update all now-dead line pointers */
	offnum = nowdead;
	for (int i = 0; i < ndead; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

		/*
		 * An LP_DEAD line pointer must be left behind when the original item
		 * (which is dead to everybody) could still be referenced by a TID in
		 * an index.  This should never be necessary with any individual
		 * heap-only tuple item, though. (It's not clear how much of a problem
		 * that would be, but there is no reason to allow it.)
		 */
		if (ItemIdHasStorage(lp))
		{
			Assert(ItemIdIsNormal(lp));
			htup = (HeapTupleHeader) PageGetItem(page, lp);
			Assert(!HeapTupleHeaderIsHeapOnly(htup));
		}
		else
		{
			/* Whole HOT chain becomes dead */
			Assert(ItemIdIsRedirected(lp));
		}
#endif

		ItemIdSetDead(lp);
	}

	/* Update all now-unused line pointers */
	offnum = nowunused;
	for (int i = 0; i < nunused; i++)
	{
		OffsetNumber off = *offnum++;
		ItemId		lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

		/*
		 * Only heap-only tuples can become LP_UNUSED during pruning.  They
		 * don't need to be left in place as LP_DEAD items until VACUUM gets
		 * around to doing index vacuuming.
		 */
		Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
		htup = (HeapTupleHeader) PageGetItem(page, lp);
		Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

		ItemIdSetUnused(lp);
	}

	/*
	 * Finally, repair any fragmentation, and update the page's hint bit about
	 * whether it has free pointers.  This must happen after all line-pointer
	 * state changes above, since it compacts tuple storage in place.
	 */
	PageRepairFragmentation(page);

	/*
	 * Now that the page has been modified, assert that redirect items still
	 * point to valid targets.
	 */
	page_verify_redirects(page);
}
1048 ECB :
1049 :
1050 : /*
1051 : * If built with assertions, verify that all LP_REDIRECT items point to a
1052 : * valid item.
1053 : *
1054 : * One way that bugs related to HOT pruning show is redirect items pointing to
1055 : * removed tuples. It's not trivial to reliably check that marking an item
1056 : * unused will not orphan a redirect item during heap_prune_chain() /
1057 : * heap_page_prune_execute(), so we additionally check the whole page after
1058 : * pruning. Without this check such bugs would typically only cause asserts
1059 : * later, potentially well after the corruption has been introduced.
1060 : *
1061 : * Also check comments in heap_page_prune_execute()'s redirection loop.
1062 : */
1063 : static void
1064 GIC 118566 : page_verify_redirects(Page page)
1065 ECB : {
1066 : #ifdef USE_ASSERT_CHECKING
1067 : OffsetNumber offnum;
1068 : OffsetNumber maxoff;
1069 :
1070 GIC 118566 : maxoff = PageGetMaxOffsetNumber(page);
1071 CBC 118566 : for (offnum = FirstOffsetNumber;
1072 7809329 : offnum <= maxoff;
1073 7690763 : offnum = OffsetNumberNext(offnum))
1074 ECB : {
1075 GIC 7690763 : ItemId itemid = PageGetItemId(page, offnum);
1076 ECB : OffsetNumber targoff;
1077 : ItemId targitem;
1078 : HeapTupleHeader htup;
1079 :
1080 GIC 7690763 : if (!ItemIdIsRedirected(itemid))
1081 CBC 7044702 : continue;
1082 ECB :
1083 GIC 646061 : targoff = ItemIdGetRedirect(itemid);
1084 CBC 646061 : targitem = PageGetItemId(page, targoff);
1085 ECB :
1086 GIC 646061 : Assert(ItemIdIsUsed(targitem));
1087 CBC 646061 : Assert(ItemIdIsNormal(targitem));
1088 646061 : Assert(ItemIdHasStorage(targitem));
1089 646061 : htup = (HeapTupleHeader) PageGetItem(page, targitem);
1090 646061 : Assert(HeapTupleHeaderIsHeapOnly(htup));
1091 ECB : }
1092 : #endif
1093 GIC 118566 : }
1094 ECB :
1095 :
1096 : /*
1097 : * For all items in this page, find their respective root line pointers.
1098 : * If item k is part of a HOT-chain with root at item j, then we set
1099 : * root_offsets[k - 1] = j.
1100 : *
1101 : * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
1102 : * Unused entries are filled with InvalidOffsetNumber (zero).
1103 : *
1104 : * The function must be called with at least share lock on the buffer, to
1105 : * prevent concurrent prune operations.
1106 : *
1107 : * Note: The information collected here is valid only as long as the caller
1108 : * holds a pin on the buffer. Once pin is released, a tuple might be pruned
1109 : * and reused by a completely unrelated tuple.
1110 : */
1111 : void
1112 GIC 204041 : heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
1113 ECB : {
1114 : OffsetNumber offnum,
1115 : maxoff;
1116 :
1117 GIC 204041 : MemSet(root_offsets, InvalidOffsetNumber,
1118 ECB : MaxHeapTuplesPerPage * sizeof(OffsetNumber));
1119 :
1120 GIC 204041 : maxoff = PageGetMaxOffsetNumber(page);
1121 CBC 14254285 : for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
1122 ECB : {
1123 GIC 14050244 : ItemId lp = PageGetItemId(page, offnum);
1124 ECB : HeapTupleHeader htup;
1125 : OffsetNumber nextoffnum;
1126 : TransactionId priorXmax;
1127 :
1128 : /* skip unused and dead items */
1129 GIC 14050244 : if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
1130 CBC 2932 : continue;
1131 ECB :
1132 GIC 14047312 : if (ItemIdIsNormal(lp))
1133 ECB : {
1134 GIC 14045874 : htup = (HeapTupleHeader) PageGetItem(page, lp);
1135 ECB :
1136 : /*
1137 : * Check if this tuple is part of a HOT-chain rooted at some other
1138 : * tuple. If so, skip it for now; we'll process it when we find
1139 : * its root.
1140 : */
1141 GIC 14045874 : if (HeapTupleHeaderIsHeapOnly(htup))
1142 CBC 1664 : continue;
1143 ECB :
1144 : /*
1145 : * This is either a plain tuple or the root of a HOT-chain.
1146 : * Remember it in the mapping.
1147 : */
1148 GIC 14044210 : root_offsets[offnum - 1] = offnum;
1149 ECB :
1150 : /* If it's not the start of a HOT-chain, we're done with it */
1151 GIC 14044210 : if (!HeapTupleHeaderIsHotUpdated(htup))
1152 CBC 14044030 : continue;
1153 ECB :
1154 : /* Set up to scan the HOT-chain */
1155 GIC 180 : nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
1156 CBC 180 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
1157 ECB : }
1158 : else
1159 : {
1160 : /* Must be a redirect item. We do not set its root_offsets entry */
1161 GIC 1438 : Assert(ItemIdIsRedirected(lp));
1162 ECB : /* Set up to scan the HOT-chain */
1163 GIC 1438 : nextoffnum = ItemIdGetRedirect(lp);
1164 CBC 1438 : priorXmax = InvalidTransactionId;
1165 ECB : }
1166 :
1167 : /*
1168 : * Now follow the HOT-chain and collect other tuples in the chain.
1169 : *
1170 : * Note: Even though this is a nested loop, the complexity of the
1171 : * function is O(N) because a tuple in the page should be visited not
1172 : * more than twice, once in the outer loop and once in HOT-chain
1173 : * chases.
1174 : */
1175 : for (;;)
1176 : {
1177 : /* Sanity check (pure paranoia) */
1178 GIC 1664 : if (offnum < FirstOffsetNumber)
1179 LBC 0 : break;
1180 EUB :
1181 : /*
1182 : * An offset past the end of page's line pointer array is possible
1183 : * when the array was truncated
1184 : */
1185 GIC 1664 : if (offnum > maxoff)
1186 LBC 0 : break;
1187 EUB :
1188 GIC 1664 : lp = PageGetItemId(page, nextoffnum);
1189 ECB :
1190 : /* Check for broken chains */
1191 GIC 1664 : if (!ItemIdIsNormal(lp))
1192 LBC 0 : break;
1193 EUB :
1194 GIC 1664 : htup = (HeapTupleHeader) PageGetItem(page, lp);
1195 ECB :
1196 GIC 1890 : if (TransactionIdIsValid(priorXmax) &&
1197 CBC 226 : !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
1198 LBC 0 : break;
1199 EUB :
1200 : /* Remember the root line pointer for this item */
1201 GIC 1664 : root_offsets[nextoffnum - 1] = offnum;
1202 ECB :
1203 : /* Advance to next chain member, if any */
1204 GIC 1664 : if (!HeapTupleHeaderIsHotUpdated(htup))
1205 ECB : break;
1206 :
1207 : /* HOT implies it can't have moved to different partition */
1208 GIC 46 : Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));
1209 ECB :
1210 GIC 46 : nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
1211 CBC 46 : priorXmax = HeapTupleHeaderGetUpdateXid(htup);
1212 ECB : }
1213 : }
1214 GIC 204041 : }
|