/*-------------------------------------------------------------------------
 *
 * pruneheap.c
 *    heap page pruning and HOT-chain management code
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/heap/pruneheap.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/heapam.h"
#include "access/heapam_xlog.h"
#include "access/htup_details.h"
#include "access/transam.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/* Working data for heap_page_prune and subroutines */
typedef struct
{
    Relation    rel;

    /* tuple visibility test, initialized for the relation */
    GlobalVisState *vistest;

    /*
     * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have
     * been computed (done on demand, and only if
     * OldSnapshotThresholdActive()). The first time a tuple is about to be
     * removed based on the limited horizon, old_snap_used is set to true,
     * and SetOldSnapshotThresholdTimestamp() is called. See
     * heap_prune_satisfies_vacuum().
     */
    TimestampTz old_snap_ts;
    TransactionId old_snap_xmin;
    bool        old_snap_used;

    TransactionId new_prune_xid;    /* new prune hint value for page */
    TransactionId snapshotConflictHorizon;  /* latest xid removed */
    int         nredirected;    /* numbers of entries in arrays below */
    int         ndead;
    int         nunused;
    /* arrays that accumulate indexes of items to be changed */
    OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
    OffsetNumber nowdead[MaxHeapTuplesPerPage];
    OffsetNumber nowunused[MaxHeapTuplesPerPage];

    /*
     * marked[i] is true if item i is entered in one of the above arrays.
     *
     * This needs to be MaxHeapTuplesPerPage + 1 long as FirstOffsetNumber is
     * 1. Otherwise every access would need to subtract 1.
     */
    bool        marked[MaxHeapTuplesPerPage + 1];

    /*
     * Tuple visibility is only computed once for each tuple, for correctness
     * and efficiency reasons; see comment in heap_page_prune() for details.
     * This is of type int8[], instead of HTSV_Result[], so we can use -1 to
     * indicate no visibility has been computed, e.g. for LP_DEAD items.
     *
     * Same indexing as ->marked.
     */
    int8        htsv[MaxHeapTuplesPerPage + 1];
} PruneState;
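
/*
 * Illustration (editor's sketch, not from the original source): pruning
 * never changes the page while scanning it; planned changes accumulate in
 * the arrays above and are applied later in one critical section. If a
 * prune pass decides to redirect item 1 to item 3 and mark item 2 unused,
 * the working state ends up as:
 *
 *     redirected[0] = 1;  redirected[1] = 3;  nredirected = 1;
 *     nowunused[0]  = 2;  nunused = 1;
 *     marked[1] = marked[2] = marked[3] = true;
 *
 * heap_page_prune_execute() later turns these arrays into the actual
 * ItemId updates.
 */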

/* Local functions */
static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate,
                                               HeapTuple tup,
                                               Buffer buffer);
static int  heap_prune_chain(Buffer buffer,
                             OffsetNumber rootoffnum,
                             PruneState *prstate);
static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
static void heap_prune_record_redirect(PruneState *prstate,
                                       OffsetNumber offnum, OffsetNumber rdoffnum);
static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
static void page_verify_redirects(Page page);


/*
 * Optionally prune and repair fragmentation in the specified page.
 *
 * This is an opportunistic function. It will perform housekeeping only if
 * the page heuristically looks like a candidate for pruning and we can
 * acquire a buffer cleanup lock without blocking.
 *
 * Note: this is called quite often. It's important that it fall out quickly
 * if there's no use in pruning.
 *
 * Caller must have pin on the buffer, and must *not* have a lock on it.
 */
void
heap_page_prune_opt(Relation relation, Buffer buffer)
{
    Page        page = BufferGetPage(buffer);
    TransactionId prune_xid;
    GlobalVisState *vistest;
    TransactionId limited_xmin = InvalidTransactionId;
    TimestampTz limited_ts = 0;
    Size        minfree;

    /*
     * We can't write WAL in recovery mode, so there's no point trying to
     * clean the page. The primary will likely issue a cleaning WAL record
     * soon anyway, so this is no particular loss.
     */
    if (RecoveryInProgress())
        return;

    /*
     * XXX: Magic to keep old_snapshot_threshold tests appearing to "work".
     * They currently are broken, and discussion of what to do about them is
     * ongoing. See
     * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de
     */
    if (old_snapshot_threshold == 0)
        SnapshotTooOldMagicForTest();

    /*
     * First check whether there's any chance there's something to prune;
     * determining the appropriate horizon is a waste if there's no prune_xid
     * (i.e. no updates/deletes have left potentially dead tuples around).
     */
    prune_xid = ((PageHeader) page)->pd_prune_xid;
    if (!TransactionIdIsValid(prune_xid))
        return;

    /*
     * Check whether prune_xid indicates that there may be dead rows that can
     * be cleaned up.
     *
     * It is OK to check the old snapshot limit before acquiring the cleanup
     * lock because the worst that can happen is that we are not quite as
     * aggressive about the cleanup (by however many transaction IDs are
     * consumed between this point and acquiring the lock). This allows us to
     * save significant overhead in the case where the page is found not to
     * be prunable.
     *
     * Even if old_snapshot_threshold is set, we first check whether the page
     * can be pruned without it, both because
     * TransactionIdLimitedForOldSnapshots() is not cheap, and because not
     * unnecessarily relying on old_snapshot_threshold avoids causing
     * conflicts.
     */
    vistest = GlobalVisTestFor(relation);

    if (!GlobalVisTestIsRemovableXid(vistest, prune_xid))
    {
        if (!OldSnapshotThresholdActive())
            return;

        if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest),
                                                 relation,
                                                 &limited_xmin, &limited_ts))
            return;

        if (!TransactionIdPrecedes(prune_xid, limited_xmin))
            return;
    }

    /*
     * We prune when a previous UPDATE failed to find enough space on the
     * page for a new tuple version, or when free space falls below the
     * relation's fill-factor target (but not less than 10%).
     *
     * Checking free space here is questionable since we aren't holding any
     * lock on the buffer; in the worst case we could get a bogus answer.
     * It's unlikely to be *seriously* wrong, though, since reading either
     * pd_lower or pd_upper is probably atomic. Avoiding taking a lock seems
     * more important than sometimes getting a wrong answer in what is after
     * all just a heuristic estimate.
     */
    minfree = RelationGetTargetPageFreeSpace(relation,
                                             HEAP_DEFAULT_FILLFACTOR);
    minfree = Max(minfree, BLCKSZ / 10);

    if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
    {
        /* OK, try to get exclusive buffer lock */
        if (!ConditionalLockBufferForCleanup(buffer))
            return;

        /*
         * Now that we have buffer lock, get accurate information about the
         * page's free space, and recheck the heuristic about whether to
         * prune. (We needn't recheck PageIsPrunable, since no one else could
         * have pruned while we hold pin.)
         */
        if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
        {
            int         ndeleted,
                        nnewlpdead;

            ndeleted = heap_page_prune(relation, buffer, vistest, limited_xmin,
                                       limited_ts, &nnewlpdead, NULL);

            /*
             * Report the number of tuples reclaimed to pgstats. This is
             * ndeleted minus the number of newly-LP_DEAD-set items.
             *
             * We derive the number of dead tuples like this to avoid totally
             * forgetting about items that were set to LP_DEAD, since they
             * still need to be cleaned up by VACUUM. We only want to count
             * heap-only tuples that just became LP_UNUSED in our report,
             * since those need no such cleanup.
             *
             * VACUUM doesn't have to compensate in the same way when it
             * tracks ndeleted, since it will set the same LP_DEAD items to
             * LP_UNUSED separately.
             */
            if (ndeleted > nnewlpdead)
                pgstat_update_heap_dead_tuples(relation,
                                               ndeleted - nnewlpdead);
        }

        /* And release buffer lock */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

        /*
         * We avoid reuse of any free space created on the page by unrelated
         * UPDATEs/INSERTs by opting to not update the FSM at this point. The
         * free space should be reused by UPDATEs to *this* page.
         */
    }
}

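/*
 * Usage sketch (editor's illustration, not part of the original file): a
 * reader that has pinned but not locked a heap page can invoke opportunistic
 * pruning before locking the page for its own access, e.g.:
 *
 *     buf = ReadBuffer(rel, blkno);
 *     heap_page_prune_opt(rel, buf);          pin held, no lock yet
 *     LockBuffer(buf, BUFFER_LOCK_SHARE);
 *     ... examine tuples ...
 *     UnlockReleaseBuffer(buf);
 *
 * This mirrors the contract stated above: pin but no lock on entry; the
 * function internally takes (and releases) a cleanup lock only if the page
 * looks worth pruning.
 */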

/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page. Note that we
 * don't update the FSM information for the page on the caller's behalf.
 * Caller might also need to account for a reduction in the length of the
 * line pointer array following array truncation by us.
 *
 * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD
 * (see heap_prune_satisfies_vacuum and HeapTupleSatisfiesVacuum).
 * old_snap_xmin / old_snap_ts must either have been set by
 * TransactionIdLimitedForOldSnapshots, or be InvalidTransactionId/0
 * respectively.
 *
 * Sets *nnewlpdead for caller, indicating the number of items that were
 * newly set LP_DEAD during the prune operation.
 *
 * If off_loc is not NULL, it is maintained to point at the offset of the
 * tuple currently being processed, for use in the caller's error callback.
 *
 * Returns the number of tuples deleted from the page during this call.
 */
int
heap_page_prune(Relation relation, Buffer buffer,
                GlobalVisState *vistest,
                TransactionId old_snap_xmin,
                TimestampTz old_snap_ts,
                int *nnewlpdead,
                OffsetNumber *off_loc)
{
    int         ndeleted = 0;
    Page        page = BufferGetPage(buffer);
    BlockNumber blockno = BufferGetBlockNumber(buffer);
    OffsetNumber offnum,
                maxoff;
    PruneState  prstate;
    HeapTupleData tup;

    /*
     * Our strategy is to scan the page and make lists of items to change,
     * then apply the changes within a critical section. This keeps as much
     * logic as possible out of the critical section, and also ensures that
     * WAL replay will work the same as the normal case.
     *
     * First, initialize the new pd_prune_xid value to zero (indicating no
     * prunable tuples). If we find any tuples which may soon become
     * prunable, we will save the lowest relevant XID in new_prune_xid. Also
     * initialize the rest of our working state.
     */
    prstate.new_prune_xid = InvalidTransactionId;
    prstate.rel = relation;
    prstate.vistest = vistest;
    prstate.old_snap_xmin = old_snap_xmin;
    prstate.old_snap_ts = old_snap_ts;
    prstate.old_snap_used = false;
    prstate.snapshotConflictHorizon = InvalidTransactionId;
    prstate.nredirected = prstate.ndead = prstate.nunused = 0;
    memset(prstate.marked, 0, sizeof(prstate.marked));

    maxoff = PageGetMaxOffsetNumber(page);
    tup.t_tableOid = RelationGetRelid(prstate.rel);

    /*
     * Determine HTSV for all tuples.
     *
     * This is required for correctness to deal with cases where running HTSV
     * twice could result in different results (e.g. RECENTLY_DEAD can turn
     * to DEAD if another checked item causes
     * GlobalVisTestIsRemovableFullXid() to update the horizon,
     * INSERT_IN_PROGRESS can change to DEAD if the inserting transaction
     * aborts, ...). That in turn could cause heap_prune_chain() to behave
     * incorrectly if a tuple is reached twice, once directly via
     * heap_prune_chain() and once following a HOT chain.
     *
     * It's also good for performance. Most commonly tuples within a page are
     * stored at decreasing offsets (while the items are stored at increasing
     * offsets). When processing all tuples on a page this leads to reading
     * memory at decreasing offsets within a page, with a variable stride.
     * That's hard for CPU prefetchers to deal with. Processing the items in
     * reverse order (and thus the tuples in increasing order) increases
     * prefetching efficiency significantly / decreases the number of cache
     * misses.
     */
    for (offnum = maxoff;
         offnum >= FirstOffsetNumber;
         offnum = OffsetNumberPrev(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        HeapTupleHeader htup;

        /* Nothing to do if slot doesn't contain a tuple */
        if (!ItemIdIsNormal(itemid))
        {
            prstate.htsv[offnum] = -1;
            continue;
        }

        htup = (HeapTupleHeader) PageGetItem(page, itemid);
        tup.t_data = htup;
        tup.t_len = ItemIdGetLength(itemid);
        ItemPointerSet(&(tup.t_self), blockno, offnum);

        /*
         * Set the offset number so that we can display it along with any
         * error that occurred while processing this tuple.
         */
        if (off_loc)
            *off_loc = offnum;

        prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup,
                                                           buffer);
    }

    /* Scan the page */
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid;

        /* Ignore items already processed as part of an earlier chain */
        if (prstate.marked[offnum])
            continue;

        /* see preceding loop */
        if (off_loc)
            *off_loc = offnum;

        /* Nothing to do if slot is empty or already dead */
        itemid = PageGetItemId(page, offnum);
        if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
            continue;

        /* Process this item or chain of items */
        ndeleted += heap_prune_chain(buffer, offnum, &prstate);
    }

    /* Clear the offset information once we have processed the given page. */
    if (off_loc)
        *off_loc = InvalidOffsetNumber;

    /* Any error while applying the changes is critical */
    START_CRIT_SECTION();

    /* Have we found any prunable items? */
    if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
    {
        /*
         * Apply the planned item changes, then repair page fragmentation,
         * and update the page's hint bit about whether it has free line
         * pointers.
         */
        heap_page_prune_execute(buffer,
                                prstate.redirected, prstate.nredirected,
                                prstate.nowdead, prstate.ndead,
                                prstate.nowunused, prstate.nunused);

        /*
         * Update the page's pd_prune_xid field to either zero, or the
         * lowest XID of any soon-prunable tuple.
         */
        ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

        /*
         * Also clear the "page is full" flag, since there's no point in
         * repeating the prune/defrag process until something else happens to
         * the page.
         */
        PageClearFull(page);

        MarkBufferDirty(buffer);

        /*
         * Emit a WAL XLOG_HEAP2_PRUNE record showing what we did
         */
        if (RelationNeedsWAL(relation))
        {
            xl_heap_prune xlrec;
            XLogRecPtr  recptr;

            xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(relation);
            xlrec.snapshotConflictHorizon = prstate.snapshotConflictHorizon;
            xlrec.nredirected = prstate.nredirected;
            xlrec.ndead = prstate.ndead;

            XLogBeginInsert();
            XLogRegisterData((char *) &xlrec, SizeOfHeapPrune);

            XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);

            /*
             * The OffsetNumber arrays are not actually in the buffer, but we
             * pretend that they are. When XLogInsert stores the whole
             * buffer, the offset arrays need not be stored too.
             */
            if (prstate.nredirected > 0)
                XLogRegisterBufData(0, (char *) prstate.redirected,
                                    prstate.nredirected *
                                    sizeof(OffsetNumber) * 2);

            if (prstate.ndead > 0)
                XLogRegisterBufData(0, (char *) prstate.nowdead,
                                    prstate.ndead * sizeof(OffsetNumber));

            if (prstate.nunused > 0)
                XLogRegisterBufData(0, (char *) prstate.nowunused,
                                    prstate.nunused * sizeof(OffsetNumber));

            recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE);

            PageSetLSN(BufferGetPage(buffer), recptr);
        }
    }
    else
    {
        /*
         * If we didn't prune anything, but have found a new value for the
         * pd_prune_xid field, update it and mark the buffer dirty. This is
         * treated as a non-WAL-logged hint.
         *
         * Also clear the "page is full" flag if it is set, since there's no
         * point in repeating the prune/defrag process until something else
         * happens to the page.
         */
        if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
            PageIsFull(page))
        {
            ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
            PageClearFull(page);
            MarkBufferDirtyHint(buffer, true);
        }
    }

    END_CRIT_SECTION();

    /* Record number of newly-set-LP_DEAD items for caller */
    *nnewlpdead = prstate.ndead;

    return ndeleted;
}

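/*
 * Usage sketch (editor's illustration, not part of the original file): a
 * VACUUM-style caller holds a cleanup lock, supplies its own visibility
 * test, and uses off_loc so its error-context callback can report the tuple
 * being processed:
 *
 *     GlobalVisState *vistest = GlobalVisTestFor(rel);
 *     int         nnewlpdead;
 *     int         ndeleted;
 *
 *     LockBufferForCleanup(buf);
 *     ndeleted = heap_page_prune(rel, buf, vistest,
 *                                InvalidTransactionId, 0,
 *                                &nnewlpdead, &errcbarg->offnum);
 *
 * Here errcbarg->offnum stands for whatever location the caller's error
 * callback inspects (a hypothetical name for this sketch).
 */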

/*
 * Perform visibility checks for heap pruning.
 *
 * This is more complicated than just using GlobalVisTestIsRemovableXid()
 * because of old_snapshot_threshold. We only want to increase the threshold
 * that triggers errors for old snapshots when we actually decide to remove a
 * row based on the limited horizon.
 *
 * Due to its cost we also only want to call
 * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have
 * done so in heap_page_prune_opt() if pd_prune_xid was old enough. But we
 * still want to be able to remove rows that are too new to be removed
 * according to prstate->vistest, but that can be removed based on
 * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots()
 * on demand in here, if appropriate.
 */
static HTSV_Result
heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
{
    HTSV_Result res;
    TransactionId dead_after;

    res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);

    if (res != HEAPTUPLE_RECENTLY_DEAD)
        return res;

    /*
     * If we are already relying on the limited xmin, there is no need to
     * delay doing so anymore.
     */
    if (prstate->old_snap_used)
    {
        Assert(TransactionIdIsValid(prstate->old_snap_xmin));

        if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
            res = HEAPTUPLE_DEAD;
        return res;
    }

    /*
     * First check if GlobalVisTestIsRemovableXid() is sufficient to find the
     * row dead. If not, and old_snapshot_threshold is enabled, try to use
     * the lowered horizon.
     */
    if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
        res = HEAPTUPLE_DEAD;
    else if (OldSnapshotThresholdActive())
    {
        /* haven't determined the limited horizon yet, so request it now */
        if (!TransactionIdIsValid(prstate->old_snap_xmin))
        {
            TransactionId horizon =
                GlobalVisTestNonRemovableHorizon(prstate->vistest);

            TransactionIdLimitedForOldSnapshots(horizon, prstate->rel,
                                                &prstate->old_snap_xmin,
                                                &prstate->old_snap_ts);
        }

        if (TransactionIdIsValid(prstate->old_snap_xmin) &&
            TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
        {
            /*
             * About to remove row based on snapshot_too_old. Need to raise
             * the threshold so problematic accesses would error.
             */
            Assert(!prstate->old_snap_used);
            SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts,
                                             prstate->old_snap_xmin);
            prstate->old_snap_used = true;
            res = HEAPTUPLE_DEAD;
        }
    }

    return res;
}

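/*
 * Worked example (editor's illustration with made-up XIDs): suppose
 * HeapTupleSatisfiesVacuumHorizon() reports RECENTLY_DEAD with
 * dead_after = 1000. If prstate->vistest says XIDs before 1001 are
 * removable, the result is upgraded to DEAD immediately. If vistest only
 * guarantees XIDs before 990, but old_snapshot_threshold yields a limited
 * xmin of 1005, the tuple is still removable; the first such removal also
 * calls SetOldSnapshotThresholdTimestamp() so that later accesses by old
 * snapshots raise "snapshot too old".
 */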

/*
 * Prune specified line pointer or a HOT chain originating at line pointer.
 *
 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
 * the HOT chain is pruned by removing all DEAD tuples at the start of the
 * HOT chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
 * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
 * DEAD; our visibility test is just too coarse to detect it.
 *
 * In general, pruning must never leave behind a DEAD tuple that still has
 * tuple storage. VACUUM isn't prepared to deal with that case. That's why
 * VACUUM prunes the same heap page a second time (without dropping its lock
 * in the interim) when it sees a newly DEAD tuple that we initially saw as
 * in-progress. Retrying pruning like this can only happen when an inserting
 * transaction concurrently aborts.
 *
 * The root line pointer is redirected to the tuple immediately after the
 * latest DEAD tuple. If all tuples in the chain are DEAD, the root line
 * pointer is marked LP_DEAD. (This includes the case of a DEAD simple
 * tuple, which we treat as a chain of length 1.)
 *
 * We don't actually change the page here. We just add entries to the arrays
 * in prstate showing the changes to be made. Items to be redirected are
 * added to the redirected[] array (two entries per redirection); items to be
 * set to LP_DEAD state are added to nowdead[]; and items to be set to
 * LP_UNUSED state are added to nowunused[].
 *
 * Returns the number of tuples (to be) deleted from the page.
 */
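/*
 * Illustration (editor's sketch, not from the original source): consider a
 * HOT chain rooted at item 1, where items 1 and 2 are DEAD and item 3 is
 * LIVE:
 *
 *     1 (root, DEAD) -> 2 (DEAD) -> 3 (LIVE)
 *
 * The scan records latestdead = 2, so item 2 is queued in nowunused[], the
 * root is queued in redirected[] as the pair (1, 3), and ndeleted counts
 * both storage-reclaiming changes (items 1 and 2). Had item 3 also been
 * DEAD, the whole chain would be dead: items 2 and 3 would become LP_UNUSED
 * and the root would be queued in nowdead[] instead.
 */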
static int
heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate)
{
    int         ndeleted = 0;
    Page        dp = (Page) BufferGetPage(buffer);
    TransactionId priorXmax = InvalidTransactionId;
    ItemId      rootlp;
    HeapTupleHeader htup;
    OffsetNumber latestdead = InvalidOffsetNumber,
                maxoff = PageGetMaxOffsetNumber(dp),
                offnum;
    OffsetNumber chainitems[MaxHeapTuplesPerPage];
    int         nchain = 0,
                i;

    rootlp = PageGetItemId(dp, rootoffnum);

    /*
     * If it's a heap-only tuple, then it is not the start of a HOT chain.
     */
    if (ItemIdIsNormal(rootlp))
    {
        Assert(prstate->htsv[rootoffnum] != -1);
        htup = (HeapTupleHeader) PageGetItem(dp, rootlp);

        if (HeapTupleHeaderIsHeapOnly(htup))
        {
            /*
             * If the tuple is DEAD and doesn't chain to anything else, mark
             * it unused immediately. (If it does chain, we can only remove
             * it as part of pruning its chain.)
             *
             * We need this primarily to handle aborted HOT updates, that is,
             * XMIN_INVALID heap-only tuples. Those might not be linked to by
             * any chain, since the parent tuple might be re-updated before
             * any pruning occurs. So we have to be able to reap them
             * separately from chain-pruning. (Note that
             * HeapTupleHeaderIsHotUpdated will never return true for an
             * XMIN_INVALID tuple, so this code will work even when there
             * were sequential updates within the aborted transaction.)
             *
             * Note that we might first arrive at a dead heap-only tuple
             * either here or while following a chain below. Whichever path
             * gets there first will mark the tuple unused.
             */
            if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD &&
                !HeapTupleHeaderIsHotUpdated(htup))
            {
                heap_prune_record_unused(prstate, rootoffnum);
                HeapTupleHeaderAdvanceConflictHorizon(htup,
                                                      &prstate->snapshotConflictHorizon);
                ndeleted++;
            }

            /* Nothing more to do */
            return ndeleted;
        }
    }

    /* Start from the root tuple */
    offnum = rootoffnum;

    /* while not end of the chain */
    for (;;)
    {
        ItemId      lp;
        bool        tupdead,
                    recent_dead;

        /* Sanity check (pure paranoia) */
        if (offnum < FirstOffsetNumber)
            break;

        /*
         * An offset past the end of page's line pointer array is possible
         * when the array was truncated (original item must have been unused)
         */
        if (offnum > maxoff)
            break;

        /* If item is already processed, stop --- it must not be the same chain */
        if (prstate->marked[offnum])
            break;

        lp = PageGetItemId(dp, offnum);

        /* Unused item obviously isn't part of the chain */
        if (!ItemIdIsUsed(lp))
            break;

        /*
         * If we are looking at the redirected root line pointer, jump to the
         * first normal tuple in the chain. If we find a redirect somewhere
         * else, stop --- it must not be the same chain.
         */
        if (ItemIdIsRedirected(lp))
        {
            if (nchain > 0)
                break;          /* not at start of chain */
            chainitems[nchain++] = offnum;
            offnum = ItemIdGetRedirect(rootlp);
            continue;
        }

        /*
         * Likewise, a dead line pointer can't be part of the chain. (We
         * already eliminated the case of dead root tuple outside this
         * function.)
         */
        if (ItemIdIsDead(lp))
            break;

        Assert(ItemIdIsNormal(lp));
        Assert(prstate->htsv[offnum] != -1);
        htup = (HeapTupleHeader) PageGetItem(dp, lp);

        /*
         * Check the tuple XMIN against prior XMAX, if any
         */
        if (TransactionIdIsValid(priorXmax) &&
            !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
            break;

        /*
         * OK, this tuple is indeed a member of the chain.
         */
        chainitems[nchain++] = offnum;

        /*
         * Check tuple's visibility status.
         */
        tupdead = recent_dead = false;

        switch ((HTSV_Result) prstate->htsv[offnum])
        {
            case HEAPTUPLE_DEAD:
                tupdead = true;
                break;

            case HEAPTUPLE_RECENTLY_DEAD:
                recent_dead = true;

                /*
                 * This tuple may soon become DEAD. Update the hint field so
                 * that the page is reconsidered for pruning in future.
                 */
                heap_prune_record_prunable(prstate,
                                           HeapTupleHeaderGetUpdateXid(htup));
                break;

            case HEAPTUPLE_DELETE_IN_PROGRESS:

                /*
                 * This tuple may soon become DEAD. Update the hint field so
                 * that the page is reconsidered for pruning in future.
                 */
                heap_prune_record_prunable(prstate,
                                           HeapTupleHeaderGetUpdateXid(htup));
                break;

            case HEAPTUPLE_LIVE:
            case HEAPTUPLE_INSERT_IN_PROGRESS:

                /*
                 * If we wanted to optimize for aborts, we might consider
                 * marking the page prunable when we see INSERT_IN_PROGRESS.
                 * But we don't. See related decisions about when to mark the
                 * page prunable in heapam.c.
                 */
                break;

            default:
                elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                break;
        }

        /*
         * Remember the last DEAD tuple seen. We will advance past
         * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
         * but we can't advance past anything else. We have to make sure that
         * we don't miss any DEAD tuples, since DEAD tuples that still have
         * tuple storage after pruning will confuse VACUUM.
         */
        if (tupdead)
        {
            latestdead = offnum;
            HeapTupleHeaderAdvanceConflictHorizon(htup,
                                                  &prstate->snapshotConflictHorizon);
        }
        else if (!recent_dead)
            break;

        /*
         * If the tuple is not HOT-updated, then we are at the end of this
         * HOT-update chain.
         */
        if (!HeapTupleHeaderIsHotUpdated(htup))
            break;

        /* HOT implies it can't have moved to different partition */
        Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

        /*
         * Advance to next chain member.
         */
        Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
               BufferGetBlockNumber(buffer));
        offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
        priorXmax = HeapTupleHeaderGetUpdateXid(htup);
    }

    /*
     * If we found a DEAD tuple in the chain, adjust the HOT chain so that
     * all the DEAD tuples at the start of the chain are removed and the root
     * line pointer is appropriately redirected.
     */
    if (OffsetNumberIsValid(latestdead))
    {
        /*
         * Mark as unused each intermediate item that we are able to remove
         * from the chain.
         *
         * When the previous item is the last dead tuple seen, we are at the
         * right candidate for redirection.
         */
        for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
        {
            heap_prune_record_unused(prstate, chainitems[i]);
            ndeleted++;
        }

        /*
         * If the root entry had been a normal tuple, we are deleting it, so
         * count it in the result. But changing a redirect (even to DEAD
         * state) doesn't count.
         */
        if (ItemIdIsNormal(rootlp))
            ndeleted++;

        /*
         * If the DEAD tuple is at the end of the chain, the entire chain is
         * dead and the root line pointer can be marked dead. Otherwise just
         * redirect the root to the correct chain member.
         */
        if (i >= nchain)
            heap_prune_record_dead(prstate, rootoffnum);
        else
            heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
    }
    else if (nchain < 2 && ItemIdIsRedirected(rootlp))
    {
        /*
         * We found a redirect item that doesn't point to a valid follow-on
         * item. This can happen if the loop in heap_page_prune caused us to
         * visit the dead successor of a redirect item before visiting the
         * redirect item. We can clean up by setting the redirect item to
         * DEAD state.
         */
        heap_prune_record_dead(prstate, rootoffnum);
    }

    return ndeleted;
}

/* Record lowest soon-prunable XID */
static void
heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
{
    /*
     * This should exactly match the PageSetPrunable macro. We can't store
     * directly into the page header yet, so we update working state.
     */
    Assert(TransactionIdIsNormal(xid));
    if (!TransactionIdIsValid(prstate->new_prune_xid) ||
        TransactionIdPrecedes(xid, prstate->new_prune_xid))
        prstate->new_prune_xid = xid;
}

/* Record line pointer to be redirected */
static void
heap_prune_record_redirect(PruneState *prstate,
                           OffsetNumber offnum, OffsetNumber rdoffnum)
{
    Assert(prstate->nredirected < MaxHeapTuplesPerPage);
    prstate->redirected[prstate->nredirected * 2] = offnum;
    prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
    prstate->nredirected++;
    Assert(!prstate->marked[offnum]);
    prstate->marked[offnum] = true;
    Assert(!prstate->marked[rdoffnum]);
    prstate->marked[rdoffnum] = true;
}

/* Record line pointer to be marked dead */
static void
heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
{
    Assert(prstate->ndead < MaxHeapTuplesPerPage);
    prstate->nowdead[prstate->ndead] = offnum;
    prstate->ndead++;
    Assert(!prstate->marked[offnum]);
    prstate->marked[offnum] = true;
}

/* Record line pointer to be marked unused */
static void
heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
{
    Assert(prstate->nunused < MaxHeapTuplesPerPage);
    prstate->nowunused[prstate->nunused] = offnum;
    prstate->nunused++;
    Assert(!prstate->marked[offnum]);
    prstate->marked[offnum] = true;
}


/*
 * Perform the actual page changes needed by heap_page_prune.
 * It is expected that the caller has a full cleanup lock on the
 * buffer.
 */
void
heap_page_prune_execute(Buffer buffer,
                        OffsetNumber *redirected, int nredirected,
                        OffsetNumber *nowdead, int ndead,
                        OffsetNumber *nowunused, int nunused)
{
    Page        page = (Page) BufferGetPage(buffer);
    OffsetNumber *offnum;
    HeapTupleHeader htup PG_USED_FOR_ASSERTS_ONLY;

    /* Shouldn't be called unless there's something to do */
    Assert(nredirected > 0 || ndead > 0 || nunused > 0);

    /* Update all redirected line pointers */
    offnum = redirected;
    for (int i = 0; i < nredirected; i++)
    {
        OffsetNumber fromoff = *offnum++;
        OffsetNumber tooff = *offnum++;
        ItemId      fromlp = PageGetItemId(page, fromoff);
        ItemId      tolp PG_USED_FOR_ASSERTS_ONLY;

#ifdef USE_ASSERT_CHECKING

        /*
         * Any existing item that we set as an LP_REDIRECT (any 'from' item)
         * must be the first item from a HOT chain. If the item has tuple
         * storage then it can't be a heap-only tuple. Otherwise we are just
         * maintaining an existing LP_REDIRECT from an existing HOT chain
         * that has been pruned at least once before now.
         */
        if (!ItemIdIsRedirected(fromlp))
        {
            Assert(ItemIdHasStorage(fromlp) && ItemIdIsNormal(fromlp));

            htup = (HeapTupleHeader) PageGetItem(page, fromlp);
            Assert(!HeapTupleHeaderIsHeapOnly(htup));
        }
        else
        {
            /* We shouldn't need to redundantly set the redirect */
            Assert(ItemIdGetRedirect(fromlp) != tooff);
        }

        /*
         * The item that we're about to set as an LP_REDIRECT (the 'from'
         * item) will point to an existing item (the 'to' item) that is
         * already a heap-only tuple. There can be at most one LP_REDIRECT
         * item per HOT chain.
         *
         * We need to keep around an LP_REDIRECT item (after original
         * non-heap-only root tuple gets pruned away) so that it's always
         * possible for VACUUM to easily figure out what TID to delete from
         * indexes when an entire HOT chain becomes dead. A heap-only tuple
         * can never become LP_DEAD; an LP_REDIRECT item or a regular heap
         * tuple can.
         *
         * This check may miss problems, e.g. the target of a redirect could
         * be marked as unused subsequently. The page_verify_redirects()
         * check below will catch such problems.
         */
        tolp = PageGetItemId(page, tooff);
        Assert(ItemIdHasStorage(tolp) && ItemIdIsNormal(tolp));
        htup = (HeapTupleHeader) PageGetItem(page, tolp);
        Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

        ItemIdSetRedirect(fromlp, tooff);
    }

    /* Update all now-dead line pointers */
    offnum = nowdead;
    for (int i = 0; i < ndead; i++)
    {
        OffsetNumber off = *offnum++;
        ItemId      lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

        /*
         * An LP_DEAD line pointer must be left behind when the original item
         * (which is dead to everybody) could still be referenced by a TID in
         * an index. This should never be necessary with any individual
         * heap-only tuple item, though. (It's not clear how much of a
         * problem that would be, but there is no reason to allow it.)
         */
        if (ItemIdHasStorage(lp))
        {
            Assert(ItemIdIsNormal(lp));
            htup = (HeapTupleHeader) PageGetItem(page, lp);
            Assert(!HeapTupleHeaderIsHeapOnly(htup));
        }
        else
        {
            /* Whole HOT chain becomes dead */
            Assert(ItemIdIsRedirected(lp));
        }
#endif

        ItemIdSetDead(lp);
    }

    /* Update all now-unused line pointers */
    offnum = nowunused;
    for (int i = 0; i < nunused; i++)
    {
        OffsetNumber off = *offnum++;
        ItemId      lp = PageGetItemId(page, off);

#ifdef USE_ASSERT_CHECKING

        /*
         * Only heap-only tuples can become LP_UNUSED during pruning. They
         * don't need to be left in place as LP_DEAD items until VACUUM gets
         * around to doing index vacuuming.
         */
        Assert(ItemIdHasStorage(lp) && ItemIdIsNormal(lp));
        htup = (HeapTupleHeader) PageGetItem(page, lp);
        Assert(HeapTupleHeaderIsHeapOnly(htup));
#endif

        ItemIdSetUnused(lp);
    }

    /*
     * Finally, repair any fragmentation, and update the page's hint bit
     * about whether it has free pointers.
     */
    PageRepairFragmentation(page);

    /*
     * Now that the page has been modified, assert that redirect items still
     * point to valid targets.
     */
    page_verify_redirects(page);
}

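/*
 * Editor's note (illustrative, not from the original file): WAL replay of an
 * XLOG_HEAP2_PRUNE record applies this same function. The redo routine
 * decodes the redirected/nowdead/nowunused arrays from the record's
 * registered buffer data and calls heap_page_prune_execute() with them,
 * which is why keeping all page mutations in this one function keeps replay
 * identical to the original pruning operation.
 */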

/*
 * If built with assertions, verify that all LP_REDIRECT items point to a
 * valid item.
 *
 * One way that bugs related to HOT pruning show up is redirect items
 * pointing to removed tuples. It's not trivial to reliably check that
 * marking an item unused will not orphan a redirect item during
 * heap_prune_chain() / heap_page_prune_execute(), so we additionally check
 * the whole page after pruning. Without this check such bugs would
 * typically only cause asserts later, potentially well after the corruption
 * has been introduced.
 *
 * Also check comments in heap_page_prune_execute()'s redirection loop.
 */
static void
page_verify_redirects(Page page)
{
#ifdef USE_ASSERT_CHECKING
    OffsetNumber offnum;
    OffsetNumber maxoff;

    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        OffsetNumber targoff;
        ItemId      targitem;
        HeapTupleHeader htup;

        if (!ItemIdIsRedirected(itemid))
            continue;

        targoff = ItemIdGetRedirect(itemid);
        targitem = PageGetItemId(page, targoff);

        Assert(ItemIdIsUsed(targitem));
        Assert(ItemIdIsNormal(targitem));
        Assert(ItemIdHasStorage(targitem));
        htup = (HeapTupleHeader) PageGetItem(page, targitem);
        Assert(HeapTupleHeaderIsHeapOnly(htup));
    }
#endif
}


/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * Unused entries are filled with InvalidOffsetNumber (zero).
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer. Once the pin is released, a tuple might be
 * pruned and reused by a completely unrelated tuple.
 */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
    OffsetNumber offnum,
                maxoff;

    MemSet(root_offsets, InvalidOffsetNumber,
           MaxHeapTuplesPerPage * sizeof(OffsetNumber));

    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        ItemId      lp = PageGetItemId(page, offnum);
        HeapTupleHeader htup;
        OffsetNumber nextoffnum;
        TransactionId priorXmax;

        /* skip unused and dead items */
        if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
            continue;

        if (ItemIdIsNormal(lp))
        {
            htup = (HeapTupleHeader) PageGetItem(page, lp);

            /*
             * Check if this tuple is part of a HOT-chain rooted at some
             * other tuple. If so, skip it for now; we'll process it when we
             * find its root.
             */
            if (HeapTupleHeaderIsHeapOnly(htup))
                continue;

            /*
             * This is either a plain tuple or the root of a HOT-chain.
             * Remember it in the mapping.
             */
            root_offsets[offnum - 1] = offnum;

            /* If it's not the start of a HOT-chain, we're done with it */
            if (!HeapTupleHeaderIsHotUpdated(htup))
                continue;

            /* Set up to scan the HOT-chain */
            nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
            priorXmax = HeapTupleHeaderGetUpdateXid(htup);
        }
        else
        {
            /* Must be a redirect item. We do not set its root_offsets entry */
            Assert(ItemIdIsRedirected(lp));
            /* Set up to scan the HOT-chain */
            nextoffnum = ItemIdGetRedirect(lp);
            priorXmax = InvalidTransactionId;
        }

        /*
         * Now follow the HOT-chain and collect other tuples in the chain.
         *
         * Note: Even though this is a nested loop, the complexity of the
         * function is O(N) because a tuple in the page is visited at most
         * twice, once in the outer loop and once in HOT-chain chases.
         */
        for (;;)
        {
            /*
             * Sanity check (pure paranoia); note that it is nextoffnum, the
             * offset we are about to dereference, that must be range-checked
             * (the original code checked the already-validated offnum here).
             */
            if (nextoffnum < FirstOffsetNumber)
                break;

            /*
             * An offset past the end of page's line pointer array is
             * possible when the array was truncated
             */
            if (nextoffnum > maxoff)
                break;

            lp = PageGetItemId(page, nextoffnum);

            /* Check for broken chains */
            if (!ItemIdIsNormal(lp))
                break;

            htup = (HeapTupleHeader) PageGetItem(page, lp);

            if (TransactionIdIsValid(priorXmax) &&
                !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
                break;

            /* Remember the root line pointer for this item */
            root_offsets[nextoffnum - 1] = offnum;

            /* Advance to next chain member, if any */
            if (!HeapTupleHeaderIsHotUpdated(htup))
                break;

            /* HOT implies it can't have moved to different partition */
            Assert(!HeapTupleHeaderIndicatesMovedPartitions(htup));

            nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
            priorXmax = HeapTupleHeaderGetUpdateXid(htup);
        }
    }
}
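
/*
 * Usage sketch (editor's illustration, not part of the original file): index
 * builds use this mapping so that index entries for heap-only tuples point
 * at the root of their HOT chain, since that is the only TID an index may
 * reference. Given a heap-only tuple at offset off:
 *
 *     OffsetNumber root_offsets[MaxHeapTuplesPerPage];
 *
 *     heap_get_root_tuples(page, root_offsets);
 *     if (OffsetNumberIsValid(root_offsets[off - 1]))
 *         ItemPointerSet(&heaptid, blkno, root_offsets[off - 1]);
 *
 * which mirrors how heapam's index-build code resolves root line pointers
 * while holding a pin on the buffer.
 */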
|