Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtdedup.c
4 : * Deduplicate or bottom-up delete items in Postgres btrees.
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtdedup.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include "access/nbtree.h"
18 : #include "access/nbtxlog.h"
19 : #include "access/xloginsert.h"
20 : #include "miscadmin.h"
21 : #include "utils/rel.h"
22 :
23 : static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
24 : TM_IndexDeleteOp *delstate);
25 : static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state,
26 : OffsetNumber minoff, IndexTuple newitem);
27 : static void _bt_singleval_fillfactor(Page page, BTDedupState state,
28 : Size newitemsz);
29 : #ifdef USE_ASSERT_CHECKING
30 : static bool _bt_posting_valid(IndexTuple posting);
31 : #endif
32 :
33 : /*
34 : * Perform a deduplication pass.
35 : *
36 : * The general approach taken here is to perform as much deduplication as
37 : * possible to free as much space as possible. Note, however, that "single
38 : * value" strategy is used for !bottomupdedup callers when the page is full of
39 : * tuples of a single value. Deduplication passes that apply the strategy
40 : * will leave behind a few untouched tuples at the end of the page, preparing
41 : * the page for an anticipated page split that uses nbtsplitloc.c's own single
42 : * value strategy. Our high level goal is to delay merging the untouched
43 : * tuples until after the page splits.
44 : *
45 : * When a call to _bt_bottomupdel_pass() just took place (and failed), our
46 : * high level goal is to prevent a page split entirely by buying more time.
47 : * We still hope that a page split can be avoided altogether. That's why
48 : * single value strategy is not even considered for bottomupdedup callers.
49 : *
50 : * The page will have to be split if we cannot successfully free at least
51 : * newitemsz (we also need space for newitem's line pointer, which isn't
52 : * included in caller's newitemsz).
53 : *
54 : * Note: Caller should have already deleted all existing items with their
55 : * LP_DEAD bits set.
56 : */
57 : void
873 pg 58 CBC 28659 : _bt_dedup_pass(Relation rel, Buffer buf, Relation heapRel, IndexTuple newitem,
59 : Size newitemsz, bool bottomupdedup)
60 : {
61 : OffsetNumber offnum,
62 : minoff,
63 : maxoff;
1138 64 28659 : Page page = BufferGetPage(buf);
373 michael 65 28659 : BTPageOpaque opaque = BTPageGetOpaque(page);
66 : Page newpage;
67 : BTDedupState state;
368 tgl 68 28659 : Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0;
1138 pg 69 28659 : bool singlevalstrat = false;
1107 70 28659 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
71 :
72 : /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
1138 73 28659 : newitemsz += sizeof(ItemIdData);
74 :
75 : /*
76 : * Initialize deduplication state.
77 : *
78 : * It would be possible for maxpostingsize (limit on posting list tuple
79 : * size) to be set to one third of the page. However, it seems like a
80 : * good idea to limit the size of posting lists to one sixth of a page.
81 : * That ought to leave us with a good split point when pages full of
82 : * duplicates can be split several times.
83 : */
84 28659 : state = (BTDedupState) palloc(sizeof(BTDedupStateData));
85 28659 : state->deduplicate = true;
1024 86 28659 : state->nmaxitems = 0;
1138 87 28659 : state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK);
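	/*
	 * With the default 8 kB block size, BTMaxItemSize() is roughly 2.7 kB
	 * (about a third of the page), so posting list tuples are capped at
	 * roughly 1.35 kB -- the "one sixth of a page" described above.
	 */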
88 : /* Metadata about base tuple of current pending posting list */
89 28659 : state->base = NULL;
90 28659 : state->baseoff = InvalidOffsetNumber;
91 28659 : state->basetupsize = 0;
92 : /* Metadata about current pending posting list TIDs */
93 28659 : state->htids = palloc(state->maxpostingsize);
94 28659 : state->nhtids = 0;
95 28659 : state->nitems = 0;
96 : /* Size of all physical tuples to be replaced by pending posting list */
97 28659 : state->phystupsize = 0;
98 : /* nintervals should be initialized to zero */
99 28659 : state->nintervals = 0;
100 :
873 101 28659 : minoff = P_FIRSTDATAKEY(opaque);
102 28659 : maxoff = PageGetMaxOffsetNumber(page);
103 :
104 : /*
105 : * Consider applying "single value" strategy, though only if the page
106 : * seems likely to be split in the near future
107 : */
565 108 28659 : if (!bottomupdedup)
1138 109 25291 : singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem);
110 :
111 : /*
112 : * Deduplicate items from page, and write them to newpage.
113 : *
114 : * Copy the original page's LSN into newpage copy. This will become the
115 : * updated version of the page. We need this because XLogInsert will
116 : * examine the LSN and possibly dump it in a page image.
117 : */
118 28659 : newpage = PageGetTempPageCopySpecial(page);
119 28659 : PageSetLSN(newpage, PageGetLSN(page));
120 :
121 : /* Copy high key, if any */
122 28659 : if (!P_RIGHTMOST(opaque))
123 : {
124 21027 : ItemId hitemid = PageGetItemId(page, P_HIKEY);
125 21027 : Size hitemsz = ItemIdGetLength(hitemid);
126 21027 : IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid);
127 :
128 21027 : if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY,
129 : false, false) == InvalidOffsetNumber)
1138 pg 130 UBC 0 : elog(ERROR, "deduplication failed to add highkey");
131 : }
132 :
1138 pg 133 CBC 28659 : for (offnum = minoff;
134 6486288 : offnum <= maxoff;
135 6457629 : offnum = OffsetNumberNext(offnum))
136 : {
137 6457629 : ItemId itemid = PageGetItemId(page, offnum);
138 6457629 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
139 :
140 6457629 : Assert(!ItemIdIsDead(itemid));
141 :
142 6457629 : if (offnum == minoff)
143 : {
144 : /*
145 : * No previous/base tuple for the data item -- use the data item
146 : * as base tuple of pending posting list
147 : */
148 28659 : _bt_dedup_start_pending(state, itup, offnum);
149 : }
150 12856912 : else if (state->deduplicate &&
1107 151 7275837 : _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
1138 152 847895 : _bt_dedup_save_htid(state, itup))
153 : {
154 : /*
155 : * Tuple is equal to base tuple of pending posting list. Heap
156 : * TID(s) for itup have been saved in state.
157 : */
158 : }
159 : else
160 : {
161 : /*
162 : * Tuple is not equal to pending posting list tuple, or
163 : * _bt_dedup_save_htid() opted to not merge current item into
164 : * pending posting list for some other reason (e.g., adding more
165 : * TIDs would have caused posting list to exceed current
166 : * maxpostingsize).
167 : *
168 : * If state contains pending posting list with more than one item,
169 : * form new posting tuple and add it to our temp page (newpage).
170 : * Else add pending interval's base tuple to the temp page as-is.
171 : */
172 5591710 : pagesaving += _bt_dedup_finish_pending(newpage, state);
173 :
174 5591710 : if (singlevalstrat)
175 : {
176 : /*
177 : * Single value strategy's extra steps.
178 : *
179 : * Lower maxpostingsize for sixth and final large posting list
180 : * tuple at the point where 5 maxpostingsize-capped tuples
181 : * have either been formed or observed.
182 : *
183 : * When a sixth maxpostingsize-capped item is formed/observed,
184 : * stop merging together tuples altogether. The few tuples
185 : * that remain at the end of the page won't be merged together
186 : * at all (at least not until after a future page split takes
187 : * place, when this page's newly allocated right sibling page
188 : * gets its first deduplication pass).
189 : */
1024 pg 190 GIC 2380 : if (state->nmaxitems == 5)
1138 pg 191 CBC 318 : _bt_singleval_fillfactor(page, state, newitemsz);
1024 192 2062 : else if (state->nmaxitems == 6)
1138 pg 193 ECB : {
1138 pg 194 GIC 123 : state->deduplicate = false;
1138 pg 195 CBC 123 : singlevalstrat = false; /* won't be back here */
1138 pg 196 ECB : }
197 : }
198 :
199 : /* itup starts new pending posting list */
1138 pg 200 GIC 5591710 : _bt_dedup_start_pending(state, itup, offnum);
1138 pg 201 ECB : }
202 : }
203 :
204 : /* Handle the last item */
1138 pg 205 GIC 28659 : pagesaving += _bt_dedup_finish_pending(newpage, state);
1138 pg 206 ECB :
207 : /*
208 : * If no items suitable for deduplication were found, newpage must be
209 : * exactly the same as the original page, so just return from function.
210 : *
211 : * We could determine whether or not to proceed on the basis of the space
212 : * savings being sufficient to avoid an immediate page split instead. We
213 : * don't do that because there is some small value in nbtsplitloc.c always
214 : * operating against a page that is fully deduplicated (apart from
215 : * newitem). Besides, most of the cost has already been paid.
216 : */
1138 pg 217 GIC 28659 : if (state->nintervals == 0)
1138 pg 218 ECB : {
219 : /* cannot leak memory here */
1138 pg 220 GIC 3729 : pfree(newpage);
1138 pg 221 CBC 3729 : pfree(state->htids);
222 3729 : pfree(state);
223 3729 : return;
1138 pg 224 ECB : }
225 :
226 : /*
227 : * By here, it's clear that deduplication will definitely go ahead.
228 : *
229 : * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace
230 : * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway.
231 : * But keep things tidy.
232 : */
1138 pg 233 GIC 24930 : if (P_HAS_GARBAGE(opaque))
1138 pg 234 ECB : {
373 michael 235 UIC 0 : BTPageOpaque nopaque = BTPageGetOpaque(newpage);
1138 pg 236 EUB :
1138 pg 237 UIC 0 : nopaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1138 pg 238 EUB : }
239 :
1138 pg 240 GIC 24930 : START_CRIT_SECTION();
1138 pg 241 ECB :
1138 pg 242 GIC 24930 : PageRestoreTempPage(newpage, page);
1138 pg 243 CBC 24930 : MarkBufferDirty(buf);
1138 pg 244 ECB :
245 : /* XLOG stuff */
1138 pg 246 GIC 24930 : if (RelationNeedsWAL(rel))
1138 pg 247 ECB : {
248 : XLogRecPtr recptr;
249 : xl_btree_dedup xlrec_dedup;
250 :
1138 pg 251 GIC 24867 : xlrec_dedup.nintervals = state->nintervals;
1138 pg 252 ECB :
1138 pg 253 GIC 24867 : XLogBeginInsert();
1138 pg 254 CBC 24867 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
255 24867 : XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup);
1138 pg 256 ECB :
257 : /*
258 : * The intervals array is not in the buffer, but pretend that it is.
259 : * When XLogInsert stores the whole buffer, the array need not be
260 : * stored too.
261 : */
1138 pg 262 GIC 24867 : XLogRegisterBufData(0, (char *) state->intervals,
1138 pg 263 CBC 24867 : state->nintervals * sizeof(BTDedupInterval));
1138 pg 264 ECB :
1138 pg 265 GIC 24867 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP);
1138 pg 266 ECB :
1138 pg 267 GIC 24867 : PageSetLSN(page, recptr);
1138 pg 268 ECB : }
269 :
1138 pg 270 GIC 24930 : END_CRIT_SECTION();
1138 pg 271 ECB :
272 : /* Local space accounting should agree with page accounting */
1138 pg 273 GIC 24930 : Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz);
1138 pg 274 ECB :
275 : /* cannot leak memory here */
1138 pg 276 GIC 24930 : pfree(state->htids);
1138 pg 277 CBC 24930 : pfree(state);
1138 pg 278 ECB : }
279 :
280 : /*
281 : * Perform bottom-up index deletion pass.
282 : *
283 : * See if duplicate index tuples (plus certain nearby tuples) are eligible to
284 : * be deleted via bottom-up index deletion. The high level goal here is to
285 : * entirely prevent "unnecessary" page splits caused by MVCC version churn
286 : * from UPDATEs (when the UPDATEs don't logically modify any of the columns
287 : * covered by the 'rel' index). This is qualitative, not quantitative -- we
288 : * do not particularly care about once-off opportunities to delete many index
289 : * tuples together.
290 : *
291 : * See nbtree/README for details on the design of nbtree bottom-up deletion.
292 : * See access/tableam.h for a description of how we're expected to cooperate
293 : * with the tableam.
294 : *
295 : * Returns true on success, in which case caller can assume page split will be
296 : * avoided for a reasonable amount of time. Returns false when caller should
297 : * deduplicate the page (if possible at all).
298 : *
299 : * Note: Occasionally we return true despite failing to delete enough items to
300 : * avoid a split. This makes caller skip deduplication and go split the page
301 : * right away. Our return value is always just advisory information.
302 : *
303 : * Note: Caller should have already deleted all existing items with their
304 : * LP_DEAD bits set.
305 : */
306 : bool
816 pg 307 GIC 3512 : _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
816 pg 308 ECB : Size newitemsz)
309 : {
310 : OffsetNumber offnum,
311 : minoff,
312 : maxoff;
816 pg 313 GIC 3512 : Page page = BufferGetPage(buf);
373 michael 314 CBC 3512 : BTPageOpaque opaque = BTPageGetOpaque(page);
816 pg 315 ECB : BTDedupState state;
316 : TM_IndexDeleteOp delstate;
317 : bool neverdedup;
816 pg 318 GIC 3512 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
816 pg 319 ECB :
320 : /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */
816 pg 321 GIC 3512 : newitemsz += sizeof(ItemIdData);
816 pg 322 ECB :
323 : /* Initialize deduplication state */
816 pg 324 GIC 3512 : state = (BTDedupState) palloc(sizeof(BTDedupStateData));
816 pg 325 CBC 3512 : state->deduplicate = true;
326 3512 : state->nmaxitems = 0;
327 3512 : state->maxpostingsize = BLCKSZ; /* We're not really deduplicating */
328 3512 : state->base = NULL;
329 3512 : state->baseoff = InvalidOffsetNumber;
330 3512 : state->basetupsize = 0;
331 3512 : state->htids = palloc(state->maxpostingsize);
332 3512 : state->nhtids = 0;
333 3512 : state->nitems = 0;
334 3512 : state->phystupsize = 0;
335 3512 : state->nintervals = 0;
816 pg 336 ECB :
337 : /*
338 : * Initialize tableam state that describes bottom-up index deletion
339 : * operation.
340 : *
341 : * We'll go on to ask the tableam to search for TIDs whose index tuples we
342 : * can safely delete. The tableam will search until our leaf page space
343 : * target is satisfied, or until the cost of continuing with the tableam
344 : * operation seems too high. It focuses its efforts on TIDs associated
345 : * with duplicate index tuples that we mark "promising".
346 : *
347 : * This space target is a little arbitrary. The tableam must be able to
348 : * keep the costs and benefits in balance. We provide the tableam with
349 : * exhaustive information about what might work, without directly
350 : * concerning ourselves with avoiding work during the tableam call. Our
351 : * role in costing the bottom-up deletion process is strictly advisory.
352 : */
521 pg 353 GIC 3512 : delstate.irel = rel;
521 pg 354 CBC 3512 : delstate.iblknum = BufferGetBlockNumber(buf);
816 355 3512 : delstate.bottomup = true;
356 3512 : delstate.bottomupfreespace = Max(BLCKSZ / 16, newitemsz);
357 3512 : delstate.ndeltids = 0;
358 3512 : delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
359 3512 : delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
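	/*
	 * With the default 8 kB BLCKSZ, the bottomupfreespace target set above
	 * works out to at least 512 bytes.
	 */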
816 pg 360 ECB :
816 pg 361 GIC 3512 : minoff = P_FIRSTDATAKEY(opaque);
816 pg 362 CBC 3512 : maxoff = PageGetMaxOffsetNumber(page);
363 3512 : for (offnum = minoff;
364 1030423 : offnum <= maxoff;
365 1026911 : offnum = OffsetNumberNext(offnum))
816 pg 366 ECB : {
816 pg 367 GIC 1026911 : ItemId itemid = PageGetItemId(page, offnum);
816 pg 368 CBC 1026911 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
816 pg 369 ECB :
816 pg 370 GIC 1026911 : Assert(!ItemIdIsDead(itemid));
816 pg 371 ECB :
816 pg 372 GIC 1026911 : if (offnum == minoff)
816 pg 373 ECB : {
374 : /* itup starts first pending interval */
816 pg 375 GIC 3512 : _bt_dedup_start_pending(state, itup, offnum);
816 pg 376 ECB : }
816 pg 377 GIC 1177990 : else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
816 pg 378 CBC 154591 : _bt_dedup_save_htid(state, itup))
816 pg 379 ECB : {
380 : /* Tuple is equal; just added its TIDs to pending interval */
381 : }
382 : else
383 : {
384 : /* Finalize interval -- move its TIDs to delete state */
816 pg 385 GIC 868808 : _bt_bottomupdel_finish_pending(page, state, &delstate);
816 pg 386 ECB :
387 : /* itup starts new pending interval */
816 pg 388 GIC 868808 : _bt_dedup_start_pending(state, itup, offnum);
816 pg 389 ECB : }
390 : }
391 : /* Finalize final interval -- move its TIDs to delete state */
816 pg 392 GIC 3512 : _bt_bottomupdel_finish_pending(page, state, &delstate);
816 pg 393 ECB :
394 : /*
395 : * We don't give up now in the event of having few (or even zero)
396 : * promising tuples for the tableam because it's not up to us as the index
397 : * AM to manage costs (note that the tableam might have heuristics of its
398 : * own that work out what to do). We should at least avoid having our
399 : * caller do a useless deduplication pass after we return in the event of
400 : * zero promising tuples, though.
401 : */
816 pg 402 GIC 3512 : neverdedup = false;
816 pg 403 CBC 3512 : if (state->nintervals == 0)
404 5 : neverdedup = true;
816 pg 405 ECB :
816 pg 406 GIC 3512 : pfree(state->htids);
816 pg 407 CBC 3512 : pfree(state);
816 pg 408 ECB :
409 : /* Ask tableam which TIDs are deletable, then physically delete them */
816 pg 410 GIC 3512 : _bt_delitems_delete_check(rel, buf, heapRel, &delstate);
816 pg 411 ECB :
816 pg 412 GIC 3512 : pfree(delstate.deltids);
816 pg 413 CBC 3512 : pfree(delstate.status);
816 pg 414 ECB :
415 : /* Report "success" to caller unconditionally to avoid deduplication */
816 pg 416 GIC 3512 : if (neverdedup)
816 pg 417 CBC 5 : return true;
816 pg 418 ECB :
419 : /* Don't dedup when we won't end up back here any time soon anyway */
816 pg 420 GIC 3507 : return PageGetExactFreeSpace(page) >= Max(BLCKSZ / 24, newitemsz);
816 pg 421 ECB : }
422 :
423 : /*
424 : * Create a new pending posting list tuple based on caller's base tuple.
425 : *
426 : * Every tuple processed by deduplication either becomes the base tuple for a
427 : * posting list, or gets its heap TID(s) accepted into a pending posting list.
428 : * A tuple that starts out as the base tuple for a posting list will only
429 : * actually be rewritten within _bt_dedup_finish_pending() when it turns out
430 : * that there are duplicates that can be merged into the base tuple.
431 : */
432 : void
1138 pg 433 GIC 9274749 : _bt_dedup_start_pending(BTDedupState state, IndexTuple base,
1138 pg 434 ECB : OffsetNumber baseoff)
435 : {
1138 pg 436 GIC 9274749 : Assert(state->nhtids == 0);
1138 pg 437 CBC 9274749 : Assert(state->nitems == 0);
438 9274749 : Assert(!BTreeTupleIsPivot(base));
1138 pg 439 ECB :
440 : /*
441 : * Copy heap TID(s) from new base tuple for new candidate posting list
442 : * into working state's array
443 : */
1138 pg 444 GIC 9274749 : if (!BTreeTupleIsPosting(base))
1138 pg 445 ECB : {
1138 pg 446 GIC 7434111 : memcpy(state->htids, &base->t_tid, sizeof(ItemPointerData));
1138 pg 447 CBC 7434111 : state->nhtids = 1;
448 7434111 : state->basetupsize = IndexTupleSize(base);
1138 pg 449 ECB : }
450 : else
451 : {
452 : int nposting;
453 :
1138 pg 454 GIC 1840638 : nposting = BTreeTupleGetNPosting(base);
1138 pg 455 CBC 1840638 : memcpy(state->htids, BTreeTupleGetPosting(base),
1138 pg 456 ECB : sizeof(ItemPointerData) * nposting);
1138 pg 457 GIC 1840638 : state->nhtids = nposting;
1138 pg 458 ECB : /* basetupsize should not include existing posting list */
1138 pg 459 GIC 1840638 : state->basetupsize = BTreeTupleGetPostingOffset(base);
1138 pg 460 ECB : }
461 :
462 : /*
463 : * Save new base tuple itself -- it'll be needed if we actually create a
464 : * new posting list from new pending posting list.
465 : *
466 : * Must maintain physical size of all existing tuples (including line
467 : * pointer overhead) so that we can calculate space savings on page.
468 : */
1138 pg 469 GIC 9274749 : state->nitems = 1;
1138 pg 470 CBC 9274749 : state->base = base;
471 9274749 : state->baseoff = baseoff;
472 9274749 : state->phystupsize = MAXALIGN(IndexTupleSize(base)) + sizeof(ItemIdData);
1138 pg 473 ECB : /* Also save baseoff in pending state for interval */
1138 pg 474 GIC 9274749 : state->intervals[state->nintervals].baseoff = state->baseoff;
1138 pg 475 CBC 9274749 : }
1138 pg 476 ECB :
477 : /*
478 : * Save itup heap TID(s) into pending posting list where possible.
479 : *
480 : * Returns bool indicating if the pending posting list managed by state now
481 : * includes itup's heap TID(s).
482 : */
483 : bool
1138 pg 484 GIC 1749807 : _bt_dedup_save_htid(BTDedupState state, IndexTuple itup)
1138 pg 485 ECB : {
486 : int nhtids;
487 : ItemPointer htids;
488 : Size mergedtupsz;
489 :
1138 pg 490 GIC 1749807 : Assert(!BTreeTupleIsPivot(itup));
1138 pg 491 ECB :
1138 pg 492 GIC 1749807 : if (!BTreeTupleIsPosting(itup))
1138 pg 493 ECB : {
1138 pg 494 GIC 1739831 : nhtids = 1;
1138 pg 495 CBC 1739831 : htids = &itup->t_tid;
1138 pg 496 ECB : }
497 : else
498 : {
1138 pg 499 GIC 9976 : nhtids = BTreeTupleGetNPosting(itup);
1138 pg 500 CBC 9976 : htids = BTreeTupleGetPosting(itup);
1138 pg 501 ECB : }
502 :
503 : /*
504 : * Don't append (have caller finish pending posting list as-is) if
505 : * appending heap TID(s) from itup would put us over maxpostingsize limit.
506 : *
507 : * This calculation needs to match the code used within _bt_form_posting()
508 : * for new posting list tuples.
509 : */
1138 pg 510 GIC 1749807 : mergedtupsz = MAXALIGN(state->basetupsize +
1138 pg 511 ECB : (state->nhtids + nhtids) * sizeof(ItemPointerData));
512 :
1138 pg 513 GIC 1749807 : if (mergedtupsz > state->maxpostingsize)
1024 pg 514 ECB : {
515 : /*
516 : * Count this as an oversized item for single value strategy, though
517 : * only when there are more than 50 TIDs in the final posting list tuple. This
518 : * limit (which is fairly arbitrary) avoids confusion about how many
519 : * 1/6 of a page tuples have been encountered/created by the current
520 : * deduplication pass.
521 : *
522 : * Note: We deliberately don't consider which deduplication pass
523 : * merged together tuples to create this item (could be a previous
524 : * deduplication pass, or current pass). See _bt_do_singleval()
525 : * comments.
526 : */
1024 pg 527 GIC 14378 : if (state->nhtids > 50)
1024 pg 528 CBC 13882 : state->nmaxitems++;
1024 pg 529 ECB :
1138 pg 530 GIC 14378 : return false;
1024 pg 531 ECB : }
532 :
533 : /*
534 : * Save heap TIDs to pending posting list tuple -- itup can be merged into
535 : * pending posting list
536 : */
1138 pg 537 GIC 1735429 : state->nitems++;
1138 pg 538 CBC 1735429 : memcpy(state->htids + state->nhtids, htids,
1138 pg 539 ECB : sizeof(ItemPointerData) * nhtids);
1138 pg 540 GIC 1735429 : state->nhtids += nhtids;
1138 pg 541 CBC 1735429 : state->phystupsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
1138 pg 542 ECB :
1138 pg 543 GIC 1735429 : return true;
1138 pg 544 ECB : }
545 :
546 : /*
547 : * Finalize pending posting list tuple, and add it to the page. Final tuple
548 : * is based on saved base tuple, and saved list of heap TIDs.
549 : *
550 : * Returns space saving from deduplicating to make a new posting list tuple.
551 : * Note that this includes line pointer overhead. This is zero in the case
552 : * where no deduplication was possible.
553 : */
554 : Size
1138 pg 555 GIC 6000318 : _bt_dedup_finish_pending(Page newpage, BTDedupState state)
1138 pg 556 ECB : {
557 : OffsetNumber tupoff;
558 : Size tuplesz;
559 : Size spacesaving;
560 :
1138 pg 561 GIC 6000318 : Assert(state->nitems > 0);
1138 pg 562 CBC 6000318 : Assert(state->nitems <= state->nhtids);
563 6000318 : Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
1138 pg 564 ECB :
1138 pg 565 GIC 6000318 : tupoff = OffsetNumberNext(PageGetMaxOffsetNumber(newpage));
1138 pg 566 CBC 6000318 : if (state->nitems == 1)
1138 pg 567 ECB : {
568 : /* Use original, unchanged base tuple */
1138 pg 569 GIC 5657980 : tuplesz = IndexTupleSize(state->base);
247 pg 570 GNC 5657980 : Assert(tuplesz == MAXALIGN(IndexTupleSize(state->base)));
571 5657980 : Assert(tuplesz <= BTMaxItemSize(newpage));
1138 pg 572 CBC 5657980 : if (PageAddItem(newpage, (Item) state->base, tuplesz, tupoff,
1138 pg 573 ECB : false, false) == InvalidOffsetNumber)
1138 pg 574 LBC 0 : elog(ERROR, "deduplication failed to add tuple to page");
1138 pg 575 ECB :
1138 pg 576 GIC 5657980 : spacesaving = 0;
1138 pg 577 EUB : }
578 : else
1138 pg 579 ECB : {
580 : IndexTuple final;
581 :
582 : /* Form a tuple with a posting list */
1138 pg 583 GIC 342338 : final = _bt_form_posting(state->base, state->htids, state->nhtids);
584 342338 : tuplesz = IndexTupleSize(final);
585 342338 : Assert(tuplesz <= state->maxpostingsize);
1138 pg 586 ECB :
587 : /* Save final number of items for posting list */
1138 pg 588 CBC 342338 : state->intervals[state->nintervals].nitems = state->nitems;
589 :
1138 pg 590 GIC 342338 : Assert(tuplesz == MAXALIGN(IndexTupleSize(final)));
247 pg 591 GNC 342338 : Assert(tuplesz <= BTMaxItemSize(newpage));
1138 pg 592 CBC 342338 : if (PageAddItem(newpage, (Item) final, tuplesz, tupoff, false,
593 : false) == InvalidOffsetNumber)
1138 pg 594 LBC 0 : elog(ERROR, "deduplication failed to add tuple to page");
1138 pg 595 ECB :
1138 pg 596 CBC 342338 : pfree(final);
1138 pg 597 GIC 342338 : spacesaving = state->phystupsize - (tuplesz + sizeof(ItemIdData));
1138 pg 598 EUB : /* Increment nintervals, since we wrote a new posting list tuple */
1138 pg 599 GIC 342338 : state->nintervals++;
1138 pg 600 CBC 342338 : Assert(spacesaving > 0 && spacesaving < BLCKSZ);
1138 pg 601 ECB : }
602 :
603 : /* Reset state for next pending posting list */
1138 pg 604 CBC 6000318 : state->nhtids = 0;
1138 pg 605 GIC 6000318 : state->nitems = 0;
606 6000318 : state->phystupsize = 0;
607 :
1138 pg 608 CBC 6000318 : return spacesaving;
1138 pg 609 ECB : }
610 :
611 : /*
816 612 : * Finalize interval during bottom-up index deletion.
613 : *
614 : * During a bottom-up pass we expect that TIDs will be recorded in dedup state
615 : * first, and then get moved over to delstate (in variable-sized batches) by
616 : * calling here. Call here happens when the number of TIDs in a dedup
617 : * interval is known, and interval gets finalized (i.e. when caller sees next
618 : * tuple on the page is not a duplicate, or when caller runs out of tuples to
619 : * process from leaf page).
620 : *
621 : * This is where bottom-up deletion determines and remembers which entries are
622 : * duplicates. This will be important information to the tableam delete
623 : * infrastructure later on. Plain index tuple duplicates are marked
624 : * "promising" here, per tableam contract.
625 : *
626 : * Our approach to marking entries whose TIDs come from posting lists is more
627 : * complicated. Posting lists can only be formed by a deduplication pass (or
628 : * during an index build), so recent version churn affecting the pointed-to
629 : * logical rows is not particularly likely. We may still give a weak signal
630 : * about posting list tuples' entries (by marking just one of its TIDs/entries
631 : * promising), though this is only a possibility in the event of further
632 : * duplicate index tuples in final interval that covers posting list tuple (as
633 : * in the plain tuple case). A weak signal/hint will be useful to the tableam
634 : * when it has no stronger signal to go with for the deletion operation as a
635 : * whole.
636 : *
637 : * The heuristics we use work well in practice because we only need to give
638 : * the tableam the right _general_ idea about where to look. Garbage tends to
639 : * naturally get concentrated in relatively few table blocks with workloads
640 : * that bottom-up deletion targets. The tableam cannot possibly rank all
641 : * available table blocks sensibly based on the hints we provide, but that's
642 : * okay -- only the extremes matter. The tableam just needs to be able to
643 : * predict which few table blocks will have the most tuples that are safe to
644 : * delete for each deletion operation, with low variance across related
645 : * deletion operations.
646 : */
647 : static void
816 pg 648 GIC 872320 : _bt_bottomupdel_finish_pending(Page page, BTDedupState state,
649 : TM_IndexDeleteOp *delstate)
650 : {
651 872320 : bool dupinterval = (state->nitems > 1);
816 pg 652 ECB :
816 pg 653 GIC 872320 : Assert(state->nitems > 0);
654 872320 : Assert(state->nitems <= state->nhtids);
816 pg 655 CBC 872320 : Assert(state->intervals[state->nintervals].baseoff == state->baseoff);
656 :
657 1899231 : for (int i = 0; i < state->nitems; i++)
816 pg 658 ECB : {
816 pg 659 CBC 1026911 : OffsetNumber offnum = state->baseoff + i;
816 pg 660 GIC 1026911 : ItemId itemid = PageGetItemId(page, offnum);
816 pg 661 CBC 1026911 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
816 pg 662 GIC 1026911 : TM_IndexDelete *ideltid = &delstate->deltids[delstate->ndeltids];
816 pg 663 CBC 1026911 : TM_IndexStatus *istatus = &delstate->status[delstate->ndeltids];
816 pg 664 ECB :
816 pg 665 CBC 1026911 : if (!BTreeTupleIsPosting(itup))
816 pg 666 ECB : {
667 : /* Simple case: A plain non-pivot tuple */
816 pg 668 GIC 858455 : ideltid->tid = itup->t_tid;
816 pg 669 CBC 858455 : ideltid->id = delstate->ndeltids;
816 pg 670 GIC 858455 : istatus->idxoffnum = offnum;
671 858455 : istatus->knowndeletable = false; /* for now */
816 pg 672 CBC 858455 : istatus->promising = dupinterval; /* simple rule */
673 858455 : istatus->freespace = ItemIdGetLength(itemid) + sizeof(ItemIdData);
816 pg 674 ECB :
816 pg 675 CBC 858455 : delstate->ndeltids++;
816 pg 676 ECB : }
677 : else
678 : {
679 : /*
680 : * Complicated case: A posting list tuple.
681 : *
682 : * We make the conservative assumption that there can only be at
683 : * most one affected logical row per posting list tuple. There
684 : * will be at most one promising entry in deltids to represent
685 : * this presumed lone logical row. Note that this isn't even
686 : * considered unless the posting list tuple is also in an interval
687 : * of duplicates -- this complicated rule is just a variant of the
688 : * simple rule used to decide if plain index tuples are promising.
689 : */
816 pg 690 GIC 168456 : int nitem = BTreeTupleGetNPosting(itup);
691 168456 : bool firstpromising = false;
692 168456 : bool lastpromising = false;
693 :
816 pg 694 CBC 168456 : Assert(_bt_posting_valid(itup));
816 pg 695 ECB :
816 pg 696 CBC 168456 : if (dupinterval)
697 : {
816 pg 698 ECB : /*
699 : * Complicated rule: either the first or last TID in the
700 : * posting list gets marked promising (if any at all)
701 : */
702 : BlockNumber minblocklist,
703 : midblocklist,
704 : maxblocklist;
705 : ItemPointer mintid,
706 : midtid,
707 : maxtid;
708 :
816 pg 709 GIC 10036 : mintid = BTreeTupleGetHeapTID(itup);
710 10036 : midtid = BTreeTupleGetPostingN(itup, nitem / 2);
711 10036 : maxtid = BTreeTupleGetMaxHeapTID(itup);
712 10036 : minblocklist = ItemPointerGetBlockNumber(mintid);
816 pg 713 CBC 10036 : midblocklist = ItemPointerGetBlockNumber(midtid);
714 10036 : maxblocklist = ItemPointerGetBlockNumber(maxtid);
816 pg 715 ECB :
716 : /* Only entry with predominant table block can be promising */
816 pg 717 CBC 10036 : firstpromising = (minblocklist == midblocklist);
718 10036 : lastpromising = (!firstpromising &&
719 : midblocklist == maxblocklist);
720 : }
816 pg 721 ECB :
816 pg 722 CBC 791085 : for (int p = 0; p < nitem; p++)
723 : {
816 pg 724 GIC 622629 : ItemPointer htid = BTreeTupleGetPostingN(itup, p);
725 :
816 pg 726 CBC 622629 : ideltid->tid = *htid;
816 pg 727 GIC 622629 : ideltid->id = delstate->ndeltids;
816 pg 728 CBC 622629 : istatus->idxoffnum = offnum;
816 pg 729 GIC 622629 : istatus->knowndeletable = false; /* for now */
816 pg 730 CBC 622629 : istatus->promising = false;
731 622629 : if ((firstpromising && p == 0) ||
732 68395 : (lastpromising && p == nitem - 1))
733 6802 : istatus->promising = true;
734 622629 : istatus->freespace = sizeof(ItemPointerData); /* at worst */
816 pg 735 ECB :
816 pg 736 CBC 622629 : ideltid++;
737 622629 : istatus++;
738 622629 : delstate->ndeltids++;
739 : }
816 pg 740 ECB : }
741 : }
742 :
816 pg 743 GIC 872320 : if (dupinterval)
744 : {
745 95295 : state->intervals[state->nintervals].nitems = state->nitems;
746 95295 : state->nintervals++;
816 pg 747 ECB : }
748 :
749 : /* Reset state for next interval */
816 pg 750 CBC 872320 : state->nhtids = 0;
816 pg 751 GIC 872320 : state->nitems = 0;
752 872320 : state->phystupsize = 0;
753 872320 : }
816 pg 754 ECB :
1138 755 : /*
756 : * Determine if page non-pivot tuples (data items) are all duplicates of the
757 : * same value -- if they are, deduplication's "single value" strategy should
758 : * be applied. The general goal of this strategy is to ensure that
759 : * nbtsplitloc.c (which uses its own single value strategy) will find a useful
760 : * split point as further duplicates are inserted, and successive rightmost
761 : * page splits occur among pages that store the same duplicate value. When
762 : * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full,
763 : * just like it would if deduplication were disabled.
764 : *
765 : * We expect that affected workloads will require _several_ single value
766 : * strategy deduplication passes (over a page that only stores duplicates)
767 : * before the page is finally split. The first deduplication pass should only
768 : * find regular non-pivot tuples. Later deduplication passes will find
769 : * existing maxpostingsize-capped posting list tuples, which must be skipped
770 : * over. The penultimate pass is generally the first pass that actually
771 : * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a
772 : * few untouched non-pivot tuples. The final deduplication pass won't free
773 : * any space -- it will skip over everything without merging anything (it
774 : * retraces the steps of the penultimate pass).
775 : *
776 : * Fortunately, having several passes isn't too expensive. Each pass (after
777 : * the first pass) won't spend many cycles on the large posting list tuples
778 : * left by previous passes. Each pass will find a large contiguous group of
779 : * smaller duplicate tuples to merge together at the end of the page.
780 : */
781 : static bool
1138 pg 782 GIC 25291 : _bt_do_singleval(Relation rel, Page page, BTDedupState state,
783 : OffsetNumber minoff, IndexTuple newitem)
784 : {
1107 785 25291 : int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
1138 pg 786 ECB : ItemId itemid;
787 : IndexTuple itup;
788 :
1138 pg 789 CBC 25291 : itemid = PageGetItemId(page, minoff);
1138 pg 790 GIC 25291 : itup = (IndexTuple) PageGetItem(page, itemid);
791 :
1107 792 25291 : if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
1138 pg 793 ECB : {
1138 pg 794 CBC 926 : itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
1138 pg 795 GIC 926 : itup = (IndexTuple) PageGetItem(page, itemid);
1138 pg 796 ECB :
1107 pg 797 GIC 926 : if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
1138 pg 798 CBC 568 : return true;
1138 pg 799 ECB : }
800 :
1138 pg 801 CBC 24723 : return false;
1138 pg 802 ECB : }
803 :
804 : /*
805 : * Lower maxpostingsize when using "single value" strategy, to avoid a sixth
806 : * and final maxpostingsize-capped tuple. The sixth and final posting list
807 : * tuple will end up somewhat smaller than the first five. (Note: The first
808 : * five tuples could actually just be very large duplicate tuples that
809 : * couldn't be merged together at all. Deduplication will simply not modify
810 : * the page when that happens.)
811 : *
812 : * When there are six posting lists on the page (after current deduplication
813 : * pass goes on to create/observe a sixth very large tuple), caller should end
814 : * its deduplication pass. It isn't useful to try to deduplicate items that
815 : * are supposed to end up on the new right sibling page following the
816 : * anticipated page split. A future deduplication pass of future right
817 : * anticipated page split. A future deduplication pass of the future right
818 : * strategy deduplication pass for a given leaf page will generally find only
819 : * plain non-pivot tuples -- see _bt_do_singleval() comments.)
820 : */
821 : static void
1138 pg 822 GIC 318 : _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz)
823 : {
824 : Size leftfree;
825 : int reduction;
1138 pg 826 ECB :
827 : /* This calculation needs to match nbtsplitloc.c */
1138 pg 828 GIC 318 : leftfree = PageGetPageSize(page) - SizeOfPageHeaderData -
829 : MAXALIGN(sizeof(BTPageOpaqueData));
830 : /* Subtract size of new high key (includes pivot heap TID space) */
831 318 : leftfree -= newitemsz + MAXALIGN(sizeof(ItemPointerData));
1138 pg 832 ECB :
833 : /*
834 : * Reduce maxpostingsize by an amount equal to target free space on left
835 : * half of page
836 : */
1138 pg 837 GIC 318 : reduction = leftfree * ((100 - BTREE_SINGLEVAL_FILLFACTOR) / 100.0);
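	/*
	 * With BTREE_SINGLEVAL_FILLFACTOR at 96, this reserves roughly 4% of
	 * leftfree -- a bit over 300 bytes on a default 8 kB page when newitem
	 * is small.
	 */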
838 318 : if (state->maxpostingsize > reduction)
839 318 : state->maxpostingsize -= reduction;
840 : else
1138 pg 841 LBC 0 : state->maxpostingsize = 0;
1138 pg 842 CBC 318 : }
1138 pg 843 ECB :
844 : /*
1138 pg 845 EUB : * Build a posting list tuple based on caller's "base" index tuple and list of
1138 pg 846 ECB : * heap TIDs. When nhtids == 1, builds a standard non-pivot tuple without a
847 : * posting list. (Posting list tuples can never have a single heap TID, partly
848 : * because that ensures that deduplication always reduces final MAXALIGN()'d
849 : * size of entire tuple.)
850 : *
851 : * Convention is that posting list starts at a MAXALIGN()'d offset (rather
852 : * than a SHORTALIGN()'d offset), in line with the approach taken when
853 : * appending a heap TID to new pivot tuple/high key during suffix truncation.
854 : * This sometimes wastes a little space that was only needed as alignment
855 : * padding in the original tuple. Following this convention simplifies the
856 : * space accounting used when deduplicating a page (the same convention
857 : * simplifies the accounting for choosing a point to split a page at).
858 : *
859 : * Note: Caller's "htids" array must be unique and already in ascending TID
860 : * order. Any existing heap TIDs from "base" won't automatically appear in
861 : * returned posting list tuple (they must be included in htids array.)
862 : */
863 : IndexTuple
1138 pg 864 GIC 381463 : _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
865 : {
866 : uint32 keysize,
867 : newsize;
1138 pg 868 ECB : IndexTuple itup;
869 :
1138 pg 870 GIC 381463 : if (BTreeTupleIsPosting(base))
871 69051 : keysize = BTreeTupleGetPostingOffset(base);
872 : else
873 312412 : keysize = IndexTupleSize(base);
1138 pg 874 ECB :
1138 pg 875 CBC 381463 : Assert(!BTreeTupleIsPivot(base));
1138 pg 876 GIC 381463 : Assert(nhtids > 0 && nhtids <= PG_UINT16_MAX);
1138 pg 877 CBC 381463 : Assert(keysize == MAXALIGN(keysize));
878 :
1138 pg 879 ECB : /* Determine final size of new tuple */
1138 pg 880 CBC 381463 : if (nhtids > 1)
881 372094 : newsize = MAXALIGN(keysize +
882 : nhtids * sizeof(ItemPointerData));
883 : else
884 9369 : newsize = keysize;
1138 pg 885 ECB :
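	/*
	 * For example, assuming 8 byte MAXALIGN: a 16 byte key plus three 6 byte
	 * heap TIDs gives newsize = MAXALIGN(16 + 18) = 40 bytes.
	 */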
1138 pg 886 GIC 381463 : Assert(newsize <= INDEX_SIZE_MASK);
887 381463 : Assert(newsize == MAXALIGN(newsize));
1138 pg 888 ECB :
889 : /* Allocate memory using palloc0() (matches index_form_tuple()) */
1138 pg 890 CBC 381463 : itup = palloc0(newsize);
891 381463 : memcpy(itup, base, keysize);
1138 pg 892 GIC 381463 : itup->t_info &= ~INDEX_SIZE_MASK;
893 381463 : itup->t_info |= newsize;
1138 pg 894 CBC 381463 : if (nhtids > 1)
1138 pg 895 ECB : {
896 : /* Form posting list tuple */
1138 pg 897 CBC 372094 : BTreeTupleSetPosting(itup, nhtids, keysize);
898 372094 : memcpy(BTreeTupleGetPosting(itup), htids,
899 : sizeof(ItemPointerData) * nhtids);
1138 pg 900 GIC 372094 : Assert(_bt_posting_valid(itup));
1138 pg 901 ECB : }
902 : else
903 : {
904 : /* Form standard non-pivot tuple */
1138 pg 905 GIC 9369 : itup->t_info &= ~INDEX_ALT_TID_MASK;
906 9369 : ItemPointerCopy(htids, &itup->t_tid);
907 9369 : Assert(ItemPointerIsValid(&itup->t_tid));
908 : }
1138 pg 909 ECB :
1138 pg 910 CBC 381463 : return itup;
1138 pg 911 ECB : }
912 :
913 : /*
914 : * Generate a replacement tuple by "updating" a posting list tuple so that it
915 : * no longer has TIDs that need to be deleted.
916 : *
917 : * Used by both VACUUM and index deletion. Caller's vacposting argument
918 : * points to the existing posting list tuple to be updated.
919 : *
920 : * On return, caller's vacposting argument will point to final "updated"
921 : * tuple, which will be palloc()'d in caller's memory context.
922 : */
923 : void
1138 pg 924 GIC 87661 : _bt_update_posting(BTVacuumPosting vacposting)
925 : {
926 87661 : IndexTuple origtuple = vacposting->itup;
927 : uint32 keysize,
1138 pg 928 ECB : newsize;
929 : IndexTuple itup;
930 : int nhtids;
931 : int ui,
932 : d;
933 : ItemPointer htids;
934 :
1138 pg 935 GIC 87661 : nhtids = BTreeTupleGetNPosting(origtuple) - vacposting->ndeletedtids;
936 :
937 87661 : Assert(_bt_posting_valid(origtuple));
938 87661 : Assert(nhtids > 0 && nhtids < BTreeTupleGetNPosting(origtuple));
1138 pg 939 ECB :
940 : /*
941 : * Determine final size of new tuple.
942 : *
943 : * This calculation needs to match the code used within _bt_form_posting()
944 : * for new posting list tuples. We avoid calling _bt_form_posting() here
945 : * to save ourselves a second memory allocation for a htids workspace.
946 : */
1134 pg 947 GIC 87661 : keysize = BTreeTupleGetPostingOffset(origtuple);
1138 948 87661 : if (nhtids > 1)
949 6637 : newsize = MAXALIGN(keysize +
950 : nhtids * sizeof(ItemPointerData));
1138 pg 951 ECB : else
1138 pg 952 CBC 81024 : newsize = keysize;
1138 pg 953 ECB :
1133 pg 954 GIC 87661 : Assert(newsize <= INDEX_SIZE_MASK);
955 87661 : Assert(newsize == MAXALIGN(newsize));
1133 pg 956 ECB :
957 : /* Allocate memory using palloc0() (matches index_form_tuple()) */
1138 pg 958 CBC 87661 : itup = palloc0(newsize);
959 87661 : memcpy(itup, origtuple, keysize);
1138 pg 960 GIC 87661 : itup->t_info &= ~INDEX_SIZE_MASK;
961 87661 : itup->t_info |= newsize;
1138 pg 962 ECB :
1138 pg 963 CBC 87661 : if (nhtids > 1)
1138 pg 964 ECB : {
965 : /* Form posting list tuple */
1138 pg 966 GIC 6637 : BTreeTupleSetPosting(itup, nhtids, keysize);
1138 pg 967 CBC 6637 : htids = BTreeTupleGetPosting(itup);
968 : }
969 : else
1138 pg 970 ECB : {
971 : /* Form standard non-pivot tuple */
1138 pg 972 GIC 81024 : itup->t_info &= ~INDEX_ALT_TID_MASK;
973 81024 : htids = &itup->t_tid;
974 : }
975 :
1138 pg 976 CBC 87661 : ui = 0;
977 87661 : d = 0;
1138 pg 978 GIC 372085 : for (int i = 0; i < BTreeTupleGetNPosting(origtuple); i++)
979 : {
1138 pg 980 CBC 284424 : if (d < vacposting->ndeletedtids && vacposting->deletetids[d] == i)
1138 pg 981 ECB : {
1138 pg 982 CBC 120806 : d++;
1138 pg 983 GIC 120806 : continue;
1138 pg 984 ECB : }
1138 pg 985 GIC 163618 : htids[ui++] = *BTreeTupleGetPostingN(origtuple, i);
1138 pg 986 ECB : }
1138 pg 987 CBC 87661 : Assert(ui == nhtids);
1138 pg 988 GIC 87661 : Assert(d == vacposting->ndeletedtids);
1138 pg 989 CBC 87661 : Assert(nhtids == 1 || _bt_posting_valid(itup));
1133 pg 990 GIC 87661 : Assert(nhtids > 1 || ItemPointerIsValid(&itup->t_tid));
1138 pg 991 ECB :
992 : /* vacposting arg's itup will now point to updated version */
1138 pg 993 CBC 87661 : vacposting->itup = itup;
994 87661 : }
995 :
996 : /*
1138 pg 997 ECB : * Prepare for a posting list split by swapping heap TID in newitem with heap
998 : * TID from original posting list (the 'oposting' heap TID located at offset
999 : * 'postingoff'). Modifies newitem, so caller should pass their own private
1000 : * copy that can safely be modified.
1001 : *
1002 : * Returns new posting list tuple, which is palloc()'d in caller's context.
1003 : * This is guaranteed to be the same size as 'oposting'. Modified newitem is
1004 : * what caller actually inserts. (This happens inside the same critical
1005 : * section that performs an in-place update of old posting list using new
1006 : * posting list returned here.)
1007 : *
1008 : * While the keys from newitem and oposting must be opclass equal, and must
1009 : * generate identical output when run through the underlying type's output
1010 : * function, it doesn't follow that their representations match exactly.
1011 : * Caller must avoid assuming that there can't be representational differences
1012 : * that make datums from oposting bigger or smaller than the corresponding
1013 : * datums from newitem. For example, differences in TOAST input state might
1014 : * break a faulty assumption about tuple size (the executor is entitled to
1015 : * apply TOAST compression based on its own criteria). It also seems possible
1016 : * that further representational variation will be introduced in the future,
1017 : * in order to support nbtree features like page-level prefix compression.
1018 : *
1019 : * See nbtree/README for details on the design of posting list splits.
1020 : */
1021 : IndexTuple
1138 pg 1022 GIC 9794 : _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, int postingoff)
1023 : {
1024 : int nhtids;
1025 : char *replacepos;
1138 pg 1026 ECB : char *replaceposright;
1027 : Size nmovebytes;
1028 : IndexTuple nposting;
1029 :
1138 pg 1030 GIC 9794 : nhtids = BTreeTupleGetNPosting(oposting);
1031 9794 : Assert(_bt_posting_valid(oposting));
1032 :
1033 : /*
695 pg 1034 ECB : * The postingoff argument originated as a _bt_binsrch_posting() return
1035 : * value. It will be 0 in the event of corruption that makes a leaf page
1036 : * contain a non-pivot tuple that's somehow identical to newitem (no two
1037 : * non-pivot tuples should ever have the same TID). This has been known
1038 : * to happen in the field from time to time.
1039 : *
1040 : * Perform a basic sanity check to catch this case now.
1041 : */
695 pg 1042 GIC 9794 : if (!(postingoff > 0 && postingoff < nhtids))
695 pg 1043 UIC 0 : elog(ERROR, "posting list tuple with %d items cannot be split at offset %d",
1044 : nhtids, postingoff);
1045 :
1138 pg 1046 ECB : /*
1138 pg 1047 EUB : * Move item pointers in posting list to make a gap for the new item's
1048 : * heap TID. We shift TIDs one place to the right, losing original
1049 : * rightmost TID. (nmovebytes must not include TIDs to the left of
1050 : * postingoff, nor the existing rightmost/max TID that gets overwritten.)
1051 : */
1138 pg 1052 GIC 9794 : nposting = CopyIndexTuple(oposting);
1053 9794 : replacepos = (char *) BTreeTupleGetPostingN(nposting, postingoff);
1054 9794 : replaceposright = (char *) BTreeTupleGetPostingN(nposting, postingoff + 1);
1055 9794 : nmovebytes = (nhtids - postingoff - 1) * sizeof(ItemPointerData);
1138 pg 1056 CBC 9794 : memmove(replaceposright, replacepos, nmovebytes);
1138 pg 1057 ECB :
1058 : /* Fill the gap at postingoff with TID of new item (original new TID) */
1138 pg 1059 CBC 9794 : Assert(!BTreeTupleIsPivot(newitem) && !BTreeTupleIsPosting(newitem));
1060 9794 : ItemPointerCopy(&newitem->t_tid, (ItemPointer) replacepos);
1061 :
1062 : /* Now copy oposting's rightmost/max TID into new item (final new TID) */
1063 9794 : ItemPointerCopy(BTreeTupleGetMaxHeapTID(oposting), &newitem->t_tid);
1138 pg 1064 ECB :
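	/*
	 * For example, if oposting held heap TIDs (0,1) (0,3) (0,5) and newitem
	 * arrived with TID (0,4), postingoff would be 2: nposting ends up with
	 * (0,1) (0,3) (0,4), while newitem leaves here carrying (0,5).
	 */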
1138 pg 1065 GIC 9794 : Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(nposting),
1066 : BTreeTupleGetHeapTID(newitem)) < 0);
1138 pg 1067 CBC 9794 : Assert(_bt_posting_valid(nposting));
1068 :
1069 9794 : return nposting;
1070 : }
1138 pg 1071 ECB :
1072 : /*
1073 : * Verify posting list invariants for "posting", which must be a posting list
1074 : * tuple. Used within assertions.
1075 : */
1076 : #ifdef USE_ASSERT_CHECKING
1077 : static bool
1138 pg 1078 GIC 654436 : _bt_posting_valid(IndexTuple posting)
1079 : {
1080 : ItemPointerData last;
1081 : ItemPointer htid;
1138 pg 1082 ECB :
1138 pg 1083 GIC 654436 : if (!BTreeTupleIsPosting(posting) || BTreeTupleGetNPosting(posting) < 2)
1138 pg 1084 UIC 0 : return false;
1085 :
1086 : /* Remember first heap TID for loop */
1138 pg 1087 CBC 654436 : ItemPointerCopy(BTreeTupleGetHeapTID(posting), &last);
1138 pg 1088 GBC 654436 : if (!ItemPointerIsValid(&last))
1138 pg 1089 UIC 0 : return false;
1090 :
1138 pg 1091 ECB : /* Iterate, starting from second TID */
1138 pg 1092 CBC 7657174 : for (int i = 1; i < BTreeTupleGetNPosting(posting); i++)
1138 pg 1093 EUB : {
1138 pg 1094 GIC 7002738 : htid = BTreeTupleGetPostingN(posting, i);
1095 :
1138 pg 1096 CBC 7002738 : if (!ItemPointerIsValid(htid))
1138 pg 1097 UIC 0 : return false;
1138 pg 1098 CBC 7002738 : if (ItemPointerCompare(htid, &last) <= 0)
1138 pg 1099 UIC 0 : return false;
1138 pg 1100 CBC 7002738 : ItemPointerCopy(htid, &last);
1138 pg 1101 EUB : }
1138 pg 1102 ECB :
1138 pg 1103 GBC 654436 : return true;
1138 pg 1104 ECB : }
1105 : #endif