/*-------------------------------------------------------------------------
 *
 * nbtinsert.c
 *	  Item insertion in Lehman and Yao btrees for Postgres.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "common/pg_prng.h"
#include "lib/qunique.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"

/* Minimum tree height for application of fastpath optimization */
#define BTREE_FASTPATH_MIN_LEVEL	2


static BTStack _bt_search_insert(Relation rel, Relation heaprel,
                                 BTInsertState insertstate);
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
                                      Relation heapRel,
                                      IndexUniqueCheck checkUnique, bool *is_unique,
                                      uint32 *speculativeToken);
static OffsetNumber _bt_findinsertloc(Relation rel,
                                      BTInsertState insertstate,
                                      bool checkingunique,
                                      bool indexUnchanged,
                                      BTStack stack,
                                      Relation heapRel);
static void _bt_stepright(Relation rel, Relation heaprel,
                          BTInsertState insertstate, BTStack stack);
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
                           Buffer buf,
                           Buffer cbuf,
                           BTStack stack,
                           IndexTuple itup,
                           Size itemsz,
                           OffsetNumber newitemoff,
                           int postingoff,
                           bool split_only_page);
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
                        Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
                        Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
                        IndexTuple nposting, uint16 postingoff);
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
                              Buffer rbuf, BTStack stack, bool isroot, bool isonly);
static Buffer _bt_newroot(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
                                OffsetNumber itup_off, bool newfirstdataitem);
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
                                         BTInsertState insertstate,
                                         bool simpleonly, bool checkingunique,
                                         bool uniquedup, bool indexUnchanged);
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
                               OffsetNumber *deletable, int ndeletable,
                               IndexTuple newitem, OffsetNumber minoff,
                               OffsetNumber maxoff);
static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
                                   int ndeletable, IndexTuple newitem,
                                   int *nblocks);
static inline int _bt_blk_cmp(const void *arg1, const void *arg2);

/*
 *	_bt_doinsert() -- Handle insertion of a single index tuple in the tree.
 *
 *		This routine is called by the public interface routine, btinsert.
 *		By here, itup is filled in, including the TID.
 *
 *		If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
 *		will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
 *		UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
 *		For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
 *		don't actually insert.
 *
 *		indexUnchanged executor hint indicates if itup is from an
 *		UPDATE that didn't logically change the indexed value, but
 *		must nevertheless have a new entry to point to a successor
 *		version.
 *
 *		The result value is only significant for UNIQUE_CHECK_PARTIAL:
 *		it must be true if the entry is known unique, else false.
 *		(In the current implementation we'll also return true after a
 *		successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
 *		that's just a coding artifact.)
 */
bool
_bt_doinsert(Relation rel, IndexTuple itup,
             IndexUniqueCheck checkUnique, bool indexUnchanged,
             Relation heapRel)
{
    bool        is_unique = false;
    BTInsertStateData insertstate;
    BTScanInsert itup_key;
    BTStack     stack;
    bool        checkingunique = (checkUnique != UNIQUE_CHECK_NO);

    /* we need an insertion scan key to do our search, so build one */
    itup_key = _bt_mkscankey(rel, heapRel, itup);

    if (checkingunique)
    {
        if (!itup_key->anynullkeys)
        {
            /* No (heapkeyspace) scantid until uniqueness established */
            itup_key->scantid = NULL;
        }
        else
        {
            /*
             * Scan key for new tuple contains NULL key values. Bypass
             * checkingunique steps. They are unnecessary because core code
             * considers NULL unequal to every value, including NULL.
             *
             * This optimization avoids O(N^2) behavior within the
             * _bt_findinsertloc() heapkeyspace path when a unique index has a
             * large number of "duplicates" with NULL key values.
             */
            checkingunique = false;
            /* Tuple is unique in the sense that core code cares about */
            Assert(checkUnique != UNIQUE_CHECK_EXISTING);
            is_unique = true;
        }
    }

    /*
     * Fill in the BTInsertState working area, to track the current page and
     * position within the page to insert on.
     *
     * Note that itemsz is passed down to lower level code that deals with
     * inserting the item. It must be MAXALIGN()'d. This ensures that space
     * accounting code consistently considers the alignment overhead that we
     * expect PageAddItem() will add later. (Actually, index_form_tuple() is
     * already conservative about alignment, but we don't rely on that from
     * this distance. Besides, preserving the "true" tuple size in index
     * tuple headers for the benefit of nbtsplitloc.c might happen someday.
     * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
     */
    insertstate.itup = itup;
    insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
    insertstate.itup_key = itup_key;
    insertstate.bounds_valid = false;
    insertstate.buf = InvalidBuffer;
    insertstate.postingoff = 0;

search:

    /*
     * Find and lock the leaf page that the tuple should be added to by
     * searching from the root page. insertstate.buf will hold a buffer that
     * is locked in exclusive mode afterwards.
     */
    stack = _bt_search_insert(rel, heapRel, &insertstate);

    /*
     * checkingunique inserts are not allowed to go ahead when two tuples with
     * equal key attribute values would be visible to new MVCC snapshots once
     * the xact commits. Check for conflicts in the locked page/buffer (if
     * needed) here.
     *
     * It might be necessary to check a page to the right in _bt_check_unique,
     * though that should be very rare. In practice the first page the value
     * could be on (with scantid omitted) is almost always also the only page
     * that a matching tuple might be found on. This is due to the behavior
     * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
     * only be allowed to cross a page boundary when there is no candidate
     * leaf page split point that avoids it. Also, _bt_check_unique can use
     * the leaf page high key to determine that there will be no duplicates on
     * the right sibling without actually visiting it (it uses the high key in
     * cases where the new item happens to belong at the far right of the leaf
     * page).
     *
     * NOTE: obviously, _bt_check_unique can only detect keys that are already
     * in the index; so it cannot defend against concurrent insertions of the
     * same key. We protect against that by means of holding a write lock on
     * the first page the value could be on, with omitted/-inf value for the
     * implicit heap TID tiebreaker attribute. Any other would-be inserter of
     * the same key must acquire a write lock on the same page, so only one
     * would-be inserter can be making the check at one time. Furthermore,
     * once we are past the check we hold write locks continuously until we
     * have performed our insertion, so no later inserter can fail to see our
     * insertion. (This requires some care in _bt_findinsertloc.)
     *
     * If we must wait for another xact, we release the lock while waiting,
     * and then must perform a new search.
     *
     * For a partial uniqueness check, we don't wait for the other xact. Just
     * let the tuple in and return false for possibly non-unique, or true for
     * definitely unique.
     */
    if (checkingunique)
    {
        TransactionId xwait;
        uint32      speculativeToken;

        xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
                                 &is_unique, &speculativeToken);

        if (unlikely(TransactionIdIsValid(xwait)))
        {
            /* Have to wait for the other guy ... */
            _bt_relbuf(rel, insertstate.buf);
            insertstate.buf = InvalidBuffer;

            /*
             * If it's a speculative insertion, wait for it to finish (ie. to
             * go ahead with the insertion, or kill the tuple). Otherwise
             * wait for the transaction to finish as usual.
             */
            if (speculativeToken)
                SpeculativeInsertionWait(xwait, speculativeToken);
            else
                XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);

            /* start over... */
            if (stack)
                _bt_freestack(stack);
            goto search;
        }

        /* Uniqueness is established -- restore heap tid as scantid */
        if (itup_key->heapkeyspace)
            itup_key->scantid = &itup->t_tid;
    }

    if (checkUnique != UNIQUE_CHECK_EXISTING)
    {
        OffsetNumber newitemoff;

        /*
         * The only conflict predicate locking cares about for indexes is when
         * an index tuple insert conflicts with an existing lock. We don't
         * know the actual page we're going to insert on for sure just yet in
         * checkingunique and !heapkeyspace cases, but it's okay to use the
         * first page the value could be on (with scantid omitted) instead.
         */
        CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));

        /*
         * Do the insertion. Note that insertstate contains cached binary
         * search bounds established within _bt_check_unique when insertion is
         * checkingunique.
         */
        newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
                                       indexUnchanged, stack, heapRel);
        _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
                       stack, itup, insertstate.itemsz, newitemoff,
                       insertstate.postingoff, false);
    }
    else
    {
        /* just release the buffer */
        _bt_relbuf(rel, insertstate.buf);
    }

    /* be tidy */
    if (stack)
        _bt_freestack(stack);
    pfree(itup_key);

    return is_unique;
}

/*
 *	_bt_search_insert() -- _bt_search() wrapper for inserts
 *
 * Search the tree for a particular scankey, or more precisely for the first
 * leaf page it could be on. Try to make use of the fastpath optimization's
 * rightmost leaf page cache before actually searching the tree from the root
 * page, though.
 *
 * Return value is a stack of parent-page pointers (though see notes about
 * fastpath optimization and page splits below). insertstate->buf is set to
 * the address of the leaf-page buffer, which is write-locked and pinned in
 * all cases (if necessary by creating a new empty root page for caller).
 *
 * The fastpath optimization avoids most of the work of searching the tree
 * repeatedly when a single backend inserts successive new tuples on the
 * rightmost leaf page of an index. A backend cache of the rightmost leaf
 * page is maintained within _bt_insertonpg(), and used here. The cache is
 * invalidated here when an insert of a non-pivot tuple must take place on a
 * non-rightmost leaf page.
 *
 * The optimization helps with indexes on an auto-incremented field. It also
 * helps with indexes on datetime columns, as well as indexes with lots of
 * NULL values. (NULLs usually get inserted in the rightmost page for single
 * column indexes, since they usually get treated as coming after everything
 * else in the key space. Individual NULL tuples will generally be placed on
 * the rightmost leaf page due to the influence of the heap TID column.)
 *
 * Note that we avoid applying the optimization when there is insufficient
 * space on the rightmost page to fit caller's new item. This is necessary
 * because we'll need to return a real descent stack when a page split is
 * expected (actually, caller can cope with a leaf page split that uses a NULL
 * stack, but that's very slow and so must be avoided). Note also that the
 * fastpath optimization acquires the lock on the page conditionally as a way
 * of reducing extra contention when there are concurrent insertions into the
 * rightmost page (we give up if we'd have to wait for the lock). We assume
 * that it isn't useful to apply the optimization when there is contention,
 * since each per-backend cache won't stay valid for long.
 */
static BTStack
_bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
{
    Assert(insertstate->buf == InvalidBuffer);
    Assert(!insertstate->bounds_valid);
    Assert(insertstate->postingoff == 0);

    if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
    {
        /* Simulate a _bt_getbuf() call with conditional locking */
        insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
        if (_bt_conditionallockbuf(rel, insertstate->buf))
        {
            Page        page;
            BTPageOpaque opaque;

            _bt_checkpage(rel, insertstate->buf);
            page = BufferGetPage(insertstate->buf);
            opaque = BTPageGetOpaque(page);

            /*
             * Check if the page is still the rightmost leaf page and has
             * enough free space to accommodate the new tuple. Also check
             * that the insertion scan key is strictly greater than the first
             * non-pivot tuple on the page. (Note that we expect itup_key's
             * scantid to be unset when our caller is a checkingunique
             * inserter.)
             */
            if (P_RIGHTMOST(opaque) &&
                P_ISLEAF(opaque) &&
                !P_IGNORE(opaque) &&
                PageGetFreeSpace(page) > insertstate->itemsz &&
                PageGetMaxOffsetNumber(page) >= P_HIKEY &&
                _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
            {
                /*
                 * Caller can use the fastpath optimization because cached
                 * block is still rightmost leaf page, which can fit caller's
                 * new tuple without splitting. Keep block in local cache for
                 * next insert, and have caller use NULL stack.
                 *
                 * Note that _bt_insert_parent() has an assertion that catches
                 * leaf page splits that somehow follow from a fastpath insert
                 * (it should only be passed a NULL stack when it must deal
                 * with a concurrent root page split, and never because a NULL
                 * stack was returned here).
                 */
                return NULL;
            }

            /* Page unsuitable for caller, drop lock and pin */
            _bt_relbuf(rel, insertstate->buf);
        }
        else
        {
            /* Lock unavailable, drop pin */
            ReleaseBuffer(insertstate->buf);
        }

        /* Forget block, since cache doesn't appear to be useful */
        RelationSetTargetBlock(rel, InvalidBlockNumber);
    }

    /* Cannot use optimization -- descend tree, return proper descent stack */
    return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
                      BT_WRITE, NULL);
}

/*
 *	_bt_check_unique() -- Check for violation of unique index constraint
 *
 * Returns InvalidTransactionId if there is no conflict, else an xact ID
 * we must wait for to see if it commits a conflicting tuple. If an actual
 * conflict is detected, no return --- just ereport(). If an xact ID is
 * returned, and the conflicting tuple still has a speculative insertion in
 * progress, *speculativeToken is set to non-zero, and the caller can wait for
 * the verdict on the insertion using SpeculativeInsertionWait().
 *
 * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
 * InvalidTransactionId because we don't want to wait. In this case we
 * set *is_unique to false if there is a potential conflict, and the
 * core code must redo the uniqueness check later.
 *
 * As a side-effect, sets state in insertstate that can later be used by
 * _bt_findinsertloc() to reuse most of the binary search work we do
 * here.
 *
 * This code treats NULLs as equal, unlike the default semantics for unique
 * indexes. So do not call here when there are NULL values in scan key and
 * the index uses the default NULLS DISTINCT mode.
 */
static TransactionId
_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
                 IndexUniqueCheck checkUnique, bool *is_unique,
                 uint32 *speculativeToken)
{
    IndexTuple  itup = insertstate->itup;
    IndexTuple  curitup = NULL;
    ItemId      curitemid = NULL;
    BTScanInsert itup_key = insertstate->itup_key;
    SnapshotData SnapshotDirty;
    OffsetNumber offset;
    OffsetNumber maxoff;
    Page        page;
    BTPageOpaque opaque;
    Buffer      nbuf = InvalidBuffer;
    bool        found = false;
    bool        inposting = false;
    bool        prevalldead = true;
    int         curposti = 0;

    /* Assume unique until we find a duplicate */
    *is_unique = true;

    InitDirtySnapshot(SnapshotDirty);

    page = BufferGetPage(insertstate->buf);
    opaque = BTPageGetOpaque(page);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * Find the first tuple with the same key.
     *
     * This also saves the binary search bounds in insertstate. We use them
     * in the fastpath below, but also in the _bt_findinsertloc() call later.
     */
    Assert(!insertstate->bounds_valid);
    offset = _bt_binsrch_insert(rel, insertstate);

    /*
     * Scan over all equal tuples, looking for live conflicts.
     */
    Assert(!insertstate->bounds_valid || insertstate->low == offset);
    Assert(!itup_key->anynullkeys);
    Assert(itup_key->scantid == NULL);
    for (;;)
    {
        /*
         * Each iteration of the loop processes one heap TID, not one index
         * tuple. Current offset number for page isn't usually advanced on
         * iterations that process heap TIDs from posting list tuples.
         *
         * "inposting" state is set when _inside_ a posting list --- not when
         * we're at the start (or end) of a posting list. We advance curposti
         * at the end of the iteration when inside a posting list tuple. In
         * general, every loop iteration either advances the page offset or
         * advances curposti --- an iteration that handles the rightmost/max
         * heap TID in a posting list finally advances the page offset (and
         * unsets "inposting").
         *
         * Make sure the offset points to an actual index tuple before trying
         * to examine it...
         */
        if (offset <= maxoff)
        {
            /*
             * Fastpath: In most cases, we can use cached search bounds to
             * limit our consideration to items that are definitely
             * duplicates. This fastpath doesn't apply when the original page
             * is empty, or when initial offset is past the end of the
             * original page, which may indicate that we need to examine a
             * second or subsequent page.
             *
             * Note that this optimization allows us to avoid calling
             * _bt_compare() directly when there are no duplicates, as long as
             * the offset where the key will go is not at the end of the page.
             */
            if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
            {
                Assert(insertstate->bounds_valid);
                Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
                Assert(insertstate->low <= insertstate->stricthigh);
                Assert(_bt_compare(rel, itup_key, page, offset) < 0);
                break;
            }

            /*
             * We can skip items that are already marked killed.
             *
             * In the presence of heavy update activity an index may contain
             * many killed items with the same key; running _bt_compare() on
             * each killed item gets expensive. Just advance over killed
             * items as quickly as we can. We only apply _bt_compare() when
             * we get to a non-killed item. We could reuse the bounds to
             * avoid _bt_compare() calls for known equal tuples, but it
             * doesn't seem worth it.
             */
            if (!inposting)
                curitemid = PageGetItemId(page, offset);
            if (inposting || !ItemIdIsDead(curitemid))
            {
                ItemPointerData htid;
                bool        all_dead = false;

                if (!inposting)
                {
                    /* Plain tuple, or first TID in posting list tuple */
                    if (_bt_compare(rel, itup_key, page, offset) != 0)
                        break;  /* we're past all the equal tuples */

                    /* Advanced curitup */
                    curitup = (IndexTuple) PageGetItem(page, curitemid);
                    Assert(!BTreeTupleIsPivot(curitup));
                }

                /* okay, we gotta fetch the heap tuple using htid ... */
                if (!BTreeTupleIsPosting(curitup))
                {
                    /* ... htid is from simple non-pivot tuple */
                    Assert(!inposting);
                    htid = curitup->t_tid;
                }
                else if (!inposting)
                {
                    /* ... htid is first TID in new posting list */
                    inposting = true;
                    prevalldead = true;
                    curposti = 0;
                    htid = *BTreeTupleGetPostingN(curitup, 0);
                }
                else
                {
                    /* ... htid is second or subsequent TID in posting list */
                    Assert(curposti > 0);
                    htid = *BTreeTupleGetPostingN(curitup, curposti);
                }

                /*
                 * If we are doing a recheck, we expect to find the tuple we
                 * are rechecking. It's not a duplicate, but we have to keep
                 * scanning.
                 */
                if (checkUnique == UNIQUE_CHECK_EXISTING &&
                    ItemPointerCompare(&htid, &itup->t_tid) == 0)
                {
                    found = true;
                }

                /*
                 * Check if there are any table tuples for this index entry
                 * satisfying SnapshotDirty. This is necessary because for AMs
                 * with optimizations like heap's HOT, we have just a single
                 * index entry for the entire chain.
                 */
                else if (table_index_fetch_tuple_check(heapRel, &htid,
                                                       &SnapshotDirty,
                                                       &all_dead))
                {
                    TransactionId xwait;

                    /*
                     * It is a duplicate. If we are only doing a partial
                     * check, then don't bother checking if the tuple is being
                     * updated in another transaction. Just return the fact
                     * that it is a potential conflict and leave the full
                     * check till later. Don't invalidate binary search
                     * bounds.
                     */
                    if (checkUnique == UNIQUE_CHECK_PARTIAL)
                    {
                        if (nbuf != InvalidBuffer)
                            _bt_relbuf(rel, nbuf);
                        *is_unique = false;
                        return InvalidTransactionId;
                    }

                    /*
                     * If this tuple is being updated by other transaction
                     * then we have to wait for its commit/abort.
                     */
                    xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
                        SnapshotDirty.xmin : SnapshotDirty.xmax;

                    if (TransactionIdIsValid(xwait))
                    {
                        if (nbuf != InvalidBuffer)
                            _bt_relbuf(rel, nbuf);
                        /* Tell _bt_doinsert to wait... */
                        *speculativeToken = SnapshotDirty.speculativeToken;
                        /* Caller releases lock on buf immediately */
                        insertstate->bounds_valid = false;
                        return xwait;
                    }

                    /*
                     * Otherwise we have a definite conflict. But before
                     * complaining, look to see if the tuple we want to insert
                     * is itself now committed dead --- if so, don't complain.
                     * This is a waste of time in normal scenarios but we must
                     * do it to support CREATE INDEX CONCURRENTLY.
                     *
                     * We must follow HOT-chains here because during
                     * concurrent index build, we insert the root TID though
                     * the actual tuple may be somewhere in the HOT-chain.
                     * While following the chain we might not stop at the
                     * exact tuple which triggered the insert, but that's OK
                     * because if we find a live tuple anywhere in this chain,
                     * we have a unique key conflict. The other live tuple is
                     * not part of this chain because it had a different index
                     * entry.
                     */
                    htid = itup->t_tid;
                    if (table_index_fetch_tuple_check(heapRel, &htid,
                                                      SnapshotSelf, NULL))
                    {
                        /* Normal case --- it's still live */
                    }
                    else
                    {
                        /*
                         * It's been deleted, so no error, and no need to
                         * continue searching
                         */
                        break;
                    }

                    /*
                     * Check for a conflict-in as we would if we were going to
                     * write to this page. We aren't actually going to write,
                     * but we want a chance to report SSI conflicts that would
                     * otherwise be masked by this unique constraint
                     * violation.
                     */
                    CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));

                    /*
                     * This is a definite conflict. Break the tuple down into
                     * datums and report the error. But first, make sure we
                     * release the buffer locks we're holding ---
                     * BuildIndexValueDescription could make catalog accesses,
                     * which in the worst case might touch this same index and
                     * cause deadlocks.
                     */
                    if (nbuf != InvalidBuffer)
                        _bt_relbuf(rel, nbuf);
                    _bt_relbuf(rel, insertstate->buf);
                    insertstate->buf = InvalidBuffer;
                    insertstate->bounds_valid = false;

                    {
                        Datum       values[INDEX_MAX_KEYS];
                        bool        isnull[INDEX_MAX_KEYS];
                        char       *key_desc;

                        index_deform_tuple(itup, RelationGetDescr(rel),
                                           values, isnull);

                        key_desc = BuildIndexValueDescription(rel, values,
                                                              isnull);

                        ereport(ERROR,
                                (errcode(ERRCODE_UNIQUE_VIOLATION),
                                 errmsg("duplicate key value violates unique constraint \"%s\"",
                                        RelationGetRelationName(rel)),
                                 key_desc ? errdetail("Key %s already exists.",
                                                      key_desc) : 0,
                                 errtableconstraint(heapRel,
                                                    RelationGetRelationName(rel))));
                    }
                }
                else if (all_dead && (!inposting ||
                                      (prevalldead &&
                                       curposti == BTreeTupleGetNPosting(curitup) - 1)))
                {
                    /*
                     * The conflicting tuple (or all HOT chains pointed to by
                     * all posting list TIDs) is dead to everyone, so mark the
                     * index entry killed.
                     */
                    ItemIdMarkDead(curitemid);
                    opaque->btpo_flags |= BTP_HAS_GARBAGE;

                    /*
                     * Mark buffer with a dirty hint, since state is not
                     * crucial. Be sure to mark the proper buffer dirty.
                     */
                    if (nbuf != InvalidBuffer)
                        MarkBufferDirtyHint(nbuf, true);
                    else
                        MarkBufferDirtyHint(insertstate->buf, true);
                }

                /*
                 * Remember if posting list tuple has even a single HOT chain
                 * whose members are not all dead
                 */
                if (!all_dead && inposting)
                    prevalldead = false;
            }
        }

        if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
        {
            /* Advance to next TID in same posting list */
            curposti++;
            continue;
        }
        else if (offset < maxoff)
        {
            /* Advance to next tuple */
            curposti = 0;
            inposting = false;
            offset = OffsetNumberNext(offset);
        }
        else
        {
            int         highkeycmp;

            /* If scankey == hikey we gotta check the next page too */
            if (P_RIGHTMOST(opaque))
                break;
            highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
            Assert(highkeycmp <= 0);
            if (highkeycmp != 0)
                break;
            /* Advance to next non-dead page --- there must be one */
            for (;;)
            {
                BlockNumber nblkno = opaque->btpo_next;

                nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
                page = BufferGetPage(nbuf);
                opaque = BTPageGetOpaque(page);
                if (!P_IGNORE(opaque))
                    break;
                if (P_RIGHTMOST(opaque))
                    elog(ERROR, "fell off the end of index \"%s\"",
                         RelationGetRelationName(rel));
            }
            /* Will also advance to next tuple */
            curposti = 0;
            inposting = false;
            maxoff = PageGetMaxOffsetNumber(page);
            offset = P_FIRSTDATAKEY(opaque);
            /* Don't invalidate binary search bounds */
        }
    }

    /*
     * If we are doing a recheck then we should have found the tuple we are
     * checking. Otherwise there's something very wrong --- probably, the
     * index is on a non-immutable expression.
     */
    if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("failed to re-find tuple within index \"%s\"",
                        RelationGetRelationName(rel)),
                 errhint("This may be because of a non-immutable index expression."),
                 errtableconstraint(heapRel,
                                    RelationGetRelationName(rel))));

    if (nbuf != InvalidBuffer)
        _bt_relbuf(rel, nbuf);

    return InvalidTransactionId;
}


/*
 *	_bt_findinsertloc() -- Finds an insert location for a tuple
 *
 *		On entry, insertstate buffer contains the page the new tuple belongs
 *		on. It is exclusive-locked and pinned by the caller.
 *
 *		If 'checkingunique' is true, the buffer on entry is the first page
 *		that contains duplicates of the new key. If there are duplicates on
 *		multiple pages, the correct insertion position might be some page to
 *		the right, rather than the first page. In that case, this function
 *		moves right to the correct target page.
 *
 *		(In a !heapkeyspace index, there can be multiple pages with the same
 *		high key, on which the new tuple could legitimately be placed. In
 *		that case, the caller passes the first page containing duplicates,
 *		just like when checkingunique=true. If that page doesn't have enough
 *		room for the new tuple, this function moves right, trying to find a
 *		legal page that does.)
 *
 *		If 'indexUnchanged' is true, this is for an UPDATE that didn't
 *		logically change the indexed value, but must nevertheless have a new
 *		entry to point to a successor version. This hint from the executor
 *		will influence our behavior when the page might have to be split and
 *		we must consider our options. Bottom-up index deletion can avoid
 *		pathological version-driven page splits, but we only want to go to the
 *		trouble of trying it when we already have moderate confidence that
 *		it's appropriate. The hint should not significantly affect our
 *		behavior over time unless practically all inserts onto the leaf page
 *		get the hint.
 *
 *		On exit, insertstate buffer contains the chosen insertion page, and
 *		the offset within that page is returned. If _bt_findinsertloc needed
 *		to move right, the lock and pin on the original page are released, and
 *		the new buffer is exclusively locked and pinned instead.
 *
 *		If insertstate contains cached binary search bounds, we will take
 *		advantage of them. This avoids repeating comparisons that we made in
 *		_bt_check_unique() already.
 */
static OffsetNumber
_bt_findinsertloc(Relation rel,
                  BTInsertState insertstate,
                  bool checkingunique,
                  bool indexUnchanged,
                  BTStack stack,
                  Relation heapRel)
{
    BTScanInsert itup_key = insertstate->itup_key;
    Page        page = BufferGetPage(insertstate->buf);
    BTPageOpaque opaque;
    OffsetNumber newitemoff;

    opaque = BTPageGetOpaque(page);

    /* Check 1/3 of a page restriction */
    if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
        _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
                             insertstate->itup);

    Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
    Assert(!insertstate->bounds_valid || checkingunique);
    Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
    Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
    Assert(!itup_key->allequalimage || itup_key->heapkeyspace);

    if (itup_key->heapkeyspace)
    {
        /* Keep track of whether checkingunique duplicate seen */
        bool        uniquedup = indexUnchanged;

        /*
         * If we're inserting into a unique index, we may have to walk right
         * through leaf pages to find the one leaf page that we must insert
         * onto.
         *
         * This is needed for checkingunique callers because a scantid was not
         * used when we called _bt_search(). scantid can only be set after
         * _bt_check_unique() has checked for duplicates. The buffer
         * initially stored in insertstate->buf has the page where the first
         * duplicate key might be found, which isn't always the page that new
         * tuple belongs on. The heap TID attribute for new tuple (scantid)
         * could force us to insert on a sibling page, though that should be
         * very rare in practice.
         */
        if (checkingunique)
        {
            if (insertstate->low < insertstate->stricthigh)
            {
                /* Encountered a duplicate in _bt_check_unique() */
                Assert(insertstate->bounds_valid);
                uniquedup = true;
            }

            for (;;)
            {
                /*
                 * Does the new tuple belong on this page?
                 *
                 * The earlier _bt_check_unique() call may well have
                 * established a strict upper bound on the offset for the new
                 * item. If it's not the last item of the page (i.e. if there
                 * is at least one tuple on the page that goes after the tuple
                 * we're inserting) then we know that the tuple belongs on
                 * this page. We can skip the high key check.
                 */
                if (insertstate->bounds_valid &&
                    insertstate->low <= insertstate->stricthigh &&
                    insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
                    break;

                /* Test '<=', not '!=', since scantid is set now */
                if (P_RIGHTMOST(opaque) ||
                    _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
                    break;

                _bt_stepright(rel, heapRel, insertstate, stack);
                /* Update local state after stepping right */
                page = BufferGetPage(insertstate->buf);
                opaque = BTPageGetOpaque(page);
                /* Assume duplicates (if checkingunique) */
                uniquedup = true;
            }
        }

        /*
         * If the target page cannot fit newitem, try to avoid splitting the
         * page on insert by performing deletion or deduplication now
         */
        if (PageGetFreeSpace(page) < insertstate->itemsz)
            _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
                                         checkingunique, uniquedup,
                                         indexUnchanged);
    }
    else
    {
        /*----------
         * This is a !heapkeyspace (version 2 or 3) index. The current page
         * is the first page that we could insert the new tuple to, but there
         * may be other pages to the right that we could opt to use instead.
         *
         * If the new key is equal to one or more existing keys, we can
         * legitimately place it anywhere in the series of equal keys. In
         * fact, if the new key is equal to the page's "high key" we can place
         * it on the next page. If it is equal to the high key, and there's
         * not room to insert the new tuple on the current page without
         * splitting, then we move right hoping to find more free space and
         * avoid a split.
         *
         * Keep scanning right until we
         *		(a) find a page with enough free space,
         *		(b) reach the last page where the tuple can legally go, or
         *		(c) get tired of searching.
         * (c) is not flippant; it is important because if there are many
         * pages' worth of equal keys, it's better to split one of the early
         * pages than to scan all the way to the end of the run of equal keys
         * on every insert. We implement "get tired" as a random choice,
         * since stopping after scanning a fixed number of pages wouldn't work
         * well (we'd never reach the right-hand side of previously split
         * pages). The probability of moving right is set at 0.99, which may
         * seem too high to change the behavior much, but it does an excellent
         * job of preventing O(N^2) behavior with many equal keys.
         *----------
         */
        while (PageGetFreeSpace(page) < insertstate->itemsz)
        {
            /*
             * Before considering moving right, see if we can obtain enough
             * space by erasing LP_DEAD items
             */
            if (P_HAS_GARBAGE(opaque))
            {
                /* Perform simple deletion */
                _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
                                             false, false, false);

                if (PageGetFreeSpace(page) >= insertstate->itemsz)
                    break;      /* OK, now we have enough space */
            }

            /*
             * Nope, so check conditions (b) and (c) enumerated above
             *
             * The earlier _bt_check_unique() call may well have established a
             * strict upper bound on the offset for the new item. If it's not
             * the last item of the page (i.e. if there is at least one tuple
             * on the page that's greater than the tuple we're inserting to)
             * then we know that the tuple belongs on this page. We can skip
             * the high key check.
             */
            if (insertstate->bounds_valid &&
                insertstate->low <= insertstate->stricthigh &&
                insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
                break;

            if (P_RIGHTMOST(opaque) ||
                _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
                pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
                break;

            _bt_stepright(rel, heapRel, insertstate, stack);
            /* Update local state after stepping right */
            page = BufferGetPage(insertstate->buf);
            opaque = BTPageGetOpaque(page);
        }
    }

    /*
     * We should now be on the correct page. Find the offset within the page
     * for the new tuple. (Possibly reusing earlier search bounds.)
     */
    Assert(P_RIGHTMOST(opaque) ||
           _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);

    newitemoff = _bt_binsrch_insert(rel, insertstate);

    if (insertstate->postingoff == -1)
    {
        /*
         * There is an overlapping posting list tuple with its LP_DEAD bit
         * set. We don't want to unnecessarily unset its LP_DEAD bit while
         * performing a posting list split, so perform simple index tuple
         * deletion early.
         */
        _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
                                     false, false, false);

        /*
         * Do new binary search. New insert location cannot overlap with any
         * posting list now.
         */
        Assert(!insertstate->bounds_valid);
        insertstate->postingoff = 0;
        newitemoff = _bt_binsrch_insert(rel, insertstate);
        Assert(insertstate->postingoff == 0);
    }

    return newitemoff;
}

/*
 * Step right to next non-dead page, during insertion.
 *
 * This is a bit more complicated than moving right in a search. We must
 * write-lock the target page before releasing write lock on current page;
 * else someone else's _bt_check_unique scan could fail to see our insertion.
 * Write locks on intermediate dead pages won't do because we don't know when
 * they will get de-linked from the tree.
 *
 * This is more aggressive than it needs to be for non-unique !heapkeyspace
 * indexes.
 */
static void
_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
{
    Page        page;
    BTPageOpaque opaque;
    Buffer      rbuf;
    BlockNumber rblkno;

    page = BufferGetPage(insertstate->buf);
    opaque = BTPageGetOpaque(page);

    rbuf = InvalidBuffer;
    rblkno = opaque->btpo_next;
    for (;;)
    {
        rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
        page = BufferGetPage(rbuf);
        opaque = BTPageGetOpaque(page);

        /*
         * If this page was incompletely split, finish the split now. We do
         * this while holding a lock on the left sibling, which is not good
         * because finishing the split could be a fairly lengthy operation.
         * But this should happen very seldom.
         */
        if (P_INCOMPLETE_SPLIT(opaque))
        {
            _bt_finish_split(rel, heaprel, rbuf, stack);
            rbuf = InvalidBuffer;
            continue;
        }

        if (!P_IGNORE(opaque))
            break;
        if (P_RIGHTMOST(opaque))
            elog(ERROR, "fell off the end of index \"%s\"",
                 RelationGetRelationName(rel));

        rblkno = opaque->btpo_next;
    }
    /* rbuf locked; unlock buf, update state for caller */
    _bt_relbuf(rel, insertstate->buf);
    insertstate->buf = rbuf;
    insertstate->bounds_valid = false;
}

/*----------
 *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
 *
 *		This recursive procedure does the following things:
 *
 *			+  if postingoff != 0, splits existing posting list tuple
 *			   (since it overlaps with new 'itup' tuple).
 *			+  if necessary, splits the target page, using 'itup_key' for
 *			   suffix truncation on leaf pages (caller passes NULL for
 *			   non-leaf pages).
 *			+  inserts the new tuple (might be split from posting list).
 *			+  if the page was split, pops the parent stack, and finds the
 *			   right place to insert the new child pointer (by walking
 *			   right using information stored in the parent stack).
 *			+  invokes itself with the appropriate tuple for the right
 *			   child page on the parent.
 *			+  updates the metapage if a true root or fast root is split.
 *
 *		On entry, we must have the correct buffer in which to do the
 *		insertion, and the buffer must be pinned and write-locked. On return,
 *		we will have dropped both the pin and the lock on the buffer.
 *
 *		This routine only performs retail tuple insertions. 'itup' should
 *		always be either a non-highkey leaf item, or a downlink (new high
 *		key items are created indirectly, when a page is split). When
 *		inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
 *		we're inserting the downlink for. This function will clear the
 *		INCOMPLETE_SPLIT flag on it, and release the buffer.
 *----------
 */
static void
_bt_insertonpg(Relation rel,
               Relation heaprel,
               BTScanInsert itup_key,
               Buffer buf,
               Buffer cbuf,
               BTStack stack,
               IndexTuple itup,
               Size itemsz,
               OffsetNumber newitemoff,
               int postingoff,
               bool split_only_page)
{
    Page        page;
    BTPageOpaque opaque;
    bool        isleaf,
                isroot,
                isrightmost,
                isonly;
    IndexTuple  oposting = NULL;
    IndexTuple  origitup = NULL;
    IndexTuple  nposting = NULL;

    page = BufferGetPage(buf);
    opaque = BTPageGetOpaque(page);
    isleaf = P_ISLEAF(opaque);
    isroot = P_ISROOT(opaque);
    isrightmost = P_RIGHTMOST(opaque);
    isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);

    /* child buffer must be given iff inserting on an internal page */
    Assert(isleaf == !BufferIsValid(cbuf));
    /* tuple must have appropriate number of attributes */
    Assert(!isleaf ||
           BTreeTupleGetNAtts(itup, rel) ==
           IndexRelationGetNumberOfAttributes(rel));
    Assert(isleaf ||
           BTreeTupleGetNAtts(itup, rel) <=
           IndexRelationGetNumberOfKeyAttributes(rel));
    Assert(!BTreeTupleIsPosting(itup));
    Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
    /* Caller must always finish incomplete split for us */
    Assert(!P_INCOMPLETE_SPLIT(opaque));

    /*
     * Every internal page should have exactly one negative infinity item at
     * all times. Only _bt_split() and _bt_newroot() should add items that
     * become negative infinity items through truncation, since they're the
     * only routines that allocate new internal pages.
     */
    Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));

    /*
     * Do we need to split an existing posting list item?
     */
    if (postingoff != 0)
    {
        ItemId      itemid = PageGetItemId(page, newitemoff);

        /*
         * The new tuple is a duplicate with a heap TID that falls inside the
         * range of an existing posting list tuple on a leaf page. Prepare to
         * split an existing posting list. Overwriting the posting list with
         * its post-split version is treated as an extra step in either the
         * insert or page split critical section.
         */
        Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
        oposting = (IndexTuple) PageGetItem(page, itemid);

        /*
         * postingoff value comes from earlier call to _bt_binsrch_posting().
         * Its binary search might think that a plain tuple must be a posting
         * list tuple that needs to be split. This can happen with corruption
         * involving an existing plain tuple that is a duplicate of the new
         * item, up to and including its table TID. Check for that here in
         * passing.
         *
         * Also verify that our caller has made sure that the existing posting
         * list tuple does not have its LP_DEAD bit set.
         */
        if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
                                     ItemPointerGetBlockNumber(&itup->t_tid),
                                     ItemPointerGetOffsetNumber(&itup->t_tid),
                                     newitemoff, BufferGetBlockNumber(buf),
                                     RelationGetRelationName(rel))));

        /* use a mutable copy of itup as our itup from here on */
        origitup = itup;
        itup = CopyIndexTuple(origitup);
        nposting = _bt_swap_posting(itup, oposting, postingoff);
        /* itup now contains rightmost/max TID from oposting */

        /* Alter offset so that newitem goes after posting list */
        newitemoff = OffsetNumberNext(newitemoff);
    }

    /*
     * Do we need to split the page to fit the item on it?
     *
     * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
     * so this comparison is correct even though we appear to be accounting
     * only for the item and not for its line pointer.
     */
    if (PageGetFreeSpace(page) < itemsz)
    {
        Buffer      rbuf;

        Assert(!split_only_page);

        /* split the buffer into left and right halves */
        rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
                         itup, origitup, nposting, postingoff);
        PredicateLockPageSplit(rel,
                               BufferGetBlockNumber(buf),
                               BufferGetBlockNumber(rbuf));

        /*----------
         * By here,
         *
         *		+  our target page has been split;
         *		+  the original tuple has been inserted;
         *		+  we have write locks on both the old (left half)
         *		   and new (right half) buffers, after the split; and
         *		+  we know the key we want to insert into the parent
         *		   (it's the "high key" on the left child page).
         *
         * We're ready to do the parent insertion. We need to hold onto the
         * locks for the child pages until we locate the parent, but we can
         * at least release the lock on the right child before doing the
         * actual insertion. The lock on the left child will be released
         * last of all by parent insertion, where it is the 'cbuf' of parent
         * page.
         *----------
         */
        _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
    }
    else
    {
        Buffer      metabuf = InvalidBuffer;
        Page        metapg = NULL;
        BTMetaPageData *metad = NULL;
        BlockNumber blockcache;

        /*
         * If we are doing this insert because we split a page that was the
         * only one on its tree level, but was not the root, it may have been
         * the "fast root". We need to ensure that the fast root link points
         * at or above the current page. We can safely acquire a lock on the
         * metapage here --- see comments for _bt_newroot().
         */
        if (unlikely(split_only_page))
        {
            Assert(!isleaf);
            Assert(BufferIsValid(cbuf));

            metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE);
            metapg = BufferGetPage(metabuf);
            metad = BTPageGetMeta(metapg);

            if (metad->btm_fastlevel >= opaque->btpo_level)
            {
                /* no update wanted */
                _bt_relbuf(rel, metabuf);
                metabuf = InvalidBuffer;
            }
        }

        /* Do the update. No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        if (postingoff != 0)
            memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));

        if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
                        false) == InvalidOffsetNumber)
            elog(PANIC, "failed to add new item to block %u in index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));

        MarkBufferDirty(buf);

        if (BufferIsValid(metabuf))
        {
            /* upgrade meta-page if needed */
            if (metad->btm_version < BTREE_NOVAC_VERSION)
                _bt_upgrademetapage(metapg);
            metad->btm_fastroot = BufferGetBlockNumber(buf);
            metad->btm_fastlevel = opaque->btpo_level;
            MarkBufferDirty(metabuf);
        }

        /*
         * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
         * finishes a split
         */
        if (!isleaf)
        {
            Page        cpage = BufferGetPage(cbuf);
            BTPageOpaque cpageop = BTPageGetOpaque(cpage);

            Assert(P_INCOMPLETE_SPLIT(cpageop));
            cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
            MarkBufferDirty(cbuf);
        }

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_insert xlrec;
            xl_btree_metadata xlmeta;
            uint8       xlinfo;
            XLogRecPtr  recptr;
            uint16      upostingoff;

            xlrec.offnum = newitemoff;

            XLogBeginInsert();
            XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);

            if (isleaf && postingoff == 0)
            {
                /* Simple leaf insert */
                xlinfo = XLOG_BTREE_INSERT_LEAF;
            }
            else if (postingoff != 0)
            {
                /*
                 * Leaf insert with posting list split. Must include
                 * postingoff field before newitem/orignewitem.
                 */
                Assert(isleaf);
                xlinfo = XLOG_BTREE_INSERT_POST;
            }
            else
            {
                /* Internal page insert, which finishes a split on cbuf */
                xlinfo = XLOG_BTREE_INSERT_UPPER;
                XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);

                if (BufferIsValid(metabuf))
                {
                    /* Actually, it's an internal page insert + meta update */
                    xlinfo = XLOG_BTREE_INSERT_META;

                    Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
                    xlmeta.version = metad->btm_version;
                    xlmeta.root = metad->btm_root;
                    xlmeta.level = metad->btm_level;
                    xlmeta.fastroot = metad->btm_fastroot;
                    xlmeta.fastlevel = metad->btm_fastlevel;
                    xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
                    xlmeta.allequalimage = metad->btm_allequalimage;

                    XLogRegisterBuffer(2, metabuf,
                                       REGBUF_WILL_INIT | REGBUF_STANDARD);
                    XLogRegisterBufData(2, (char *) &xlmeta,
                                        sizeof(xl_btree_metadata));
                }
            }

            XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
            if (postingoff == 0)
            {
                /* Just log itup from caller */
1138 pg 1368 CBC 6836597 : XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1369 : }
1370 : else
1138 pg 1371 ECB : {
1372 : /*
1373 : * Insert with posting list split (XLOG_BTREE_INSERT_POST
1374 : * record) case.
1375 : *
1376 : * Log postingoff. Also log origitup, not itup. REDO routine
1377 : * must reconstruct final itup (as well as nposting) using
1378 : * _bt_swap_posting().
1379 : */
1074 pg 1380 GIC 7740 : upostingoff = postingoff;
1381 :
1138 1382 7740 : XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1138 pg 1383 CBC 7740 : XLogRegisterBufData(0, (char *) origitup,
1138 pg 1384 GIC 7740 : IndexTupleSize(origitup));
1138 pg 1385 ECB : }
7352 tgl 1386 :
3062 heikki.linnakangas 1387 CBC 6844337 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1388 :
7352 tgl 1389 GIC 6844337 : if (BufferIsValid(metabuf))
7352 tgl 1390 CBC 12 : PageSetLSN(metapg, recptr);
1091 pg 1391 GIC 6844337 : if (!isleaf)
2545 kgrittn 1392 CBC 19942 : PageSetLSN(BufferGetPage(cbuf), recptr);
9345 bruce 1393 ECB :
7352 tgl 1394 CBC 6844337 : PageSetLSN(page, recptr);
9620 vadim4o 1395 ECB : }
1396 :
7352 tgl 1397 CBC 7164256 : END_CRIT_SECTION();
1398 :
1399 : /* Release subsidiary buffers */
1400 7164256 : if (BufferIsValid(metabuf))
6218 tgl 1401 GIC 12 : _bt_relbuf(rel, metabuf);
1091 pg 1402 7164256 : if (!isleaf)
3309 heikki.linnakangas 1403 CBC 20760 : _bt_relbuf(rel, cbuf);
1117 pg 1404 ECB :
1405 : /*
1406 : * Cache the block number if this is the rightmost leaf page. Cache
1407 : * may be used by a future inserter within _bt_search_insert().
1408 : */
1117 pg 1409 GIC 7164256 : blockcache = InvalidBlockNumber;
873 1410 7164256 : if (isrightmost && isleaf && !isroot)
1117 1411 3449620 : blockcache = BufferGetBlockNumber(buf);
1117 pg 1412 ECB :
1413 : /* Release buffer for insertion target block */
6218 tgl 1414 CBC 7164256 : _bt_relbuf(rel, buf);
1415 :
1416 : /*
1117 pg 1417 ECB : * If we decided to cache the insertion target block before releasing
1418 : * its buffer lock, then cache it now. Check the height of the tree
1419 : * first, though. We don't go for the optimization with small
 1420 : * indexes. Defer the final check to this point to ensure that we don't
1421 : * call _bt_getrootheight while holding a buffer lock.
1422 : */
1117 pg 1423 GIC 10613876 : if (BlockNumberIsValid(blockcache) &&
8 andres 1424 GNC 3449620 : _bt_getrootheight(rel, heaprel) >= BTREE_FASTPATH_MIN_LEVEL)
1117 pg 1425 GIC 34278 : RelationSetTargetBlock(rel, blockcache);
9345 bruce 1426 ECB : }
1138 pg 1427 :
1428 : /* be tidy */
1138 pg 1429 GIC 7187735 : if (postingoff != 0)
1430 : {
1431 : /* itup is actually a modified copy of caller's original */
1138 pg 1432 CBC 7766 : pfree(nposting);
1138 pg 1433 GIC 7766 : pfree(itup);
1434 : }
9770 scrappy 1435 CBC 7187735 : }
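/*
 * Illustrative standalone sketch (added commentary, not part of nbtinsert.c):
 * the shape of the "fit or split" decision in _bt_insertonpg() above, using a
 * toy fixed-size page.  All names here (toy_page, toy_free_space, toy_insert)
 * are hypothetical.
 */
#ifdef ILLUSTRATIVE_SKETCH
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define TOY_PAGE_SIZE 256
#define TOY_LP_SIZE 4			/* stand-in for sizeof(ItemIdData) */

typedef struct toy_page
{
	size_t		nused;			/* bytes consumed by items + line pointers */
	char		data[TOY_PAGE_SIZE];
} toy_page;

/* Like PageGetFreeSpace(): reserve room for one more line pointer. */
static size_t
toy_free_space(const toy_page *p)
{
	size_t		avail = TOY_PAGE_SIZE - p->nused;

	return (avail > TOY_LP_SIZE) ? avail - TOY_LP_SIZE : 0;
}

/* Returns true if the item fit; false means "caller must split the page". */
static bool
toy_insert(toy_page *p, const void *item, size_t itemsz)
{
	if (toy_free_space(p) < itemsz)
		return false;			/* analogous to taking the _bt_split() path */
	memcpy(p->data + p->nused, item, itemsz);
	p->nused += itemsz + TOY_LP_SIZE;	/* item plus its line pointer */
	return true;
}
#endif							/* ILLUSTRATIVE_SKETCH */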
9770 scrappy 1436 ECB :
1437 : /*
9345 bruce 1438 : * _bt_split() -- split a page in the btree.
1439 : *
1440 : * On entry, buf is the page to split, and is pinned and write-locked.
1441 : * newitemoff etc. tell us about the new item that must be inserted
1442 : * along with the data from the original page.
1443 : *
1444 : * itup_key is used for suffix truncation on leaf pages (internal
1445 : * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1446 : * is the left-sibling of the page we're inserting the downlink for.
1447 : * This function will clear the INCOMPLETE_SPLIT flag on it, and
1448 : * release the buffer.
1449 : *
1450 : * orignewitem, nposting, and postingoff are needed when an insert of
1451 : * orignewitem results in both a posting list split and a page split.
1452 : * These extra posting list split details are used here in the same
1453 : * way as they are used in the more common case where a posting list
1454 : * split does not coincide with a page split. We need to deal with
1455 : * posting list splits directly in order to ensure that everything
1456 : * that follows from the insert of orignewitem is handled as a single
1457 : * atomic operation (though caller's insert of a new pivot/downlink
1458 : * into parent page will still be a separate operation). See
1459 : * nbtree/README for details on the design of posting list splits.
1460 : *
1461 : * Returns the new right sibling of buf, pinned and write-locked.
1462 : * The pin and lock on buf are maintained.
1463 : */
1464 : static Buffer
8 andres 1465 GNC 23479 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1466 : Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1467 : IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
9770 scrappy 1468 ECB : {
1469 : Buffer rbuf;
1470 : Page origpage;
1471 : Page leftpage,
1472 : rightpage;
1473 : BlockNumber origpagenumber,
1474 : rightpagenumber;
1475 : BTPageOpaque ropaque,
1476 : lopaque,
1477 : oopaque;
7351 tgl 1478 GIC 23479 : Buffer sbuf = InvalidBuffer;
1479 23479 : Page spage = NULL;
1480 23479 : BTPageOpaque sopaque = NULL;
9344 bruce 1481 ECB : Size itemsz;
1482 : ItemId itemid;
1091 pg 1483 : IndexTuple firstright,
1484 : lefthighkey;
1485 : OffsetNumber firstrightoff;
1486 : OffsetNumber afterleftoff,
1487 : afterrightoff,
1488 : minusinfoff;
1489 : OffsetNumber origpagepostingoff;
1490 : OffsetNumber maxoff;
1491 : OffsetNumber i;
1492 : bool newitemonleft,
1493 : isleaf,
1494 : isrightmost;
1495 :
1496 : /*
1497 : * origpage is the original page to be split. leftpage is a temporary
1498 : * buffer that receives the left-sibling data, which will be copied back
1499 : * into origpage on success. rightpage is the new page that will receive
1500 : * the right-sibling data.
1501 : *
1502 : * leftpage is allocated after choosing a split point. rightpage's new
 1503 : * buffer isn't acquired until after leftpage is initialized and has its
 1504 : * new high key, the last point where splitting the page may fail (barring
 1505 : * corruption). Failing before acquiring the new buffer won't have lasting
1506 : * consequences, since origpage won't have been modified and leftpage is
1507 : * only workspace.
1508 : */
2545 kgrittn 1509 GIC 23479 : origpage = BufferGetPage(buf);
373 michael 1510 23479 : oopaque = BTPageGetOpaque(origpage);
1091 pg 1511 23479 : isleaf = P_ISLEAF(oopaque);
1091 pg 1512 CBC 23479 : isrightmost = P_RIGHTMOST(oopaque);
1513 23479 : maxoff = PageGetMaxOffsetNumber(origpage);
4606 tgl 1514 23479 : origpagenumber = BufferGetBlockNumber(buf);
9345 bruce 1515 ECB :
5842 tgl 1516 : /*
1427 pg 1517 : * Choose a point to split origpage at.
1518 : *
1519 : * A split point can be thought of as a point _between_ two existing data
1520 : * items on origpage (the lastleft and firstright tuples), provided you
1521 : * pretend that the new item that didn't fit is already on origpage.
1522 : *
1523 : * Since origpage does not actually contain newitem, the representation of
1524 : * split points needs to work with two boundary cases: splits where
1525 : * newitem is lastleft, and splits where newitem is firstright.
1526 : * newitemonleft resolves the ambiguity that would otherwise exist when
1527 : * newitemoff == firstrightoff. In all other cases it's clear which side
1528 : * of the split every tuple goes on from context. newitemonleft is
1529 : * usually (but not always) redundant information.
1530 : *
1531 : * firstrightoff is supposed to be an origpage offset number, but it's
1532 : * possible that its value will be maxoff+1, which is "past the end" of
1533 : * origpage. This happens in the rare case where newitem goes after all
1534 : * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1535 : * origpage at the point that leaves newitem alone on new right page. Any
1536 : * "!newitemonleft && newitemoff == firstrightoff" split point makes
1537 : * newitem the firstright tuple, though, so this case isn't a special
1538 : * case.
1539 : */
1091 pg 1540 GIC 23479 : firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1541 : newitem, &newitemonleft);
1542 :
1427 pg 1543 ECB : /* Allocate temp buffer for leftpage */
1427 pg 1544 GIC 23479 : leftpage = PageGetTempPage(origpage);
1545 23479 : _bt_pageinit(leftpage, BufferGetPageSize(buf));
373 michael 1546 23479 : lopaque = BTPageGetOpaque(leftpage);
9345 bruce 1547 ECB :
1427 pg 1548 : /*
1549 : * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1550 : * and HAS_GARBAGE flags.
1551 : */
8222 vadim4o 1552 GIC 23479 : lopaque->btpo_flags = oopaque->btpo_flags;
6102 tgl 1553 23479 : lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1554 : /* set flag in leftpage indicating that rightpage has no downlink yet */
3309 heikki.linnakangas 1555 CBC 23479 : lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
9345 bruce 1556 23479 : lopaque->btpo_prev = oopaque->btpo_prev;
1557 : /* handle btpo_next after rightpage buffer acquired */
774 pg 1558 23479 : lopaque->btpo_level = oopaque->btpo_level;
1427 pg 1559 ECB : /* handle btpo_cycleid after rightpage buffer acquired */
1560 :
9345 bruce 1561 : /*
1562 : * Copy the original page's LSN into leftpage, which will become the
1563 : * updated version of the page. We need this because XLogInsert will
1564 : * examine the LSN and possibly dump it in a page image.
1565 : */
1427 pg 1566 GIC 23479 : PageSetLSN(leftpage, PageGetLSN(origpage));
1567 :
1568 : /*
1138 pg 1569 ECB : * Determine page offset number of existing overlapped-with-orignewitem
1570 : * posting list when it is necessary to perform a posting list split in
1571 : * passing. Note that newitem was already changed by caller (newitem no
1572 : * longer has the orignewitem TID).
1573 : *
1574 : * This page offset number (origpagepostingoff) will be used to pretend
1575 : * that the posting split has already taken place, even though the
1576 : * required modifications to origpage won't occur until we reach the
1577 : * critical section. The lastleft and firstright tuples of our page split
1578 : * point should, in effect, come from an imaginary version of origpage
1579 : * that has the nposting tuple instead of the original posting list tuple.
1580 : *
1581 : * Note: _bt_findsplitloc() should have compensated for coinciding posting
1582 : * list splits in just the same way, at least in theory. It doesn't
1583 : * bother with that, though. In practice it won't affect its choice of
1584 : * split point.
1585 : */
1138 pg 1586 GIC 23479 : origpagepostingoff = InvalidOffsetNumber;
1587 23479 : if (postingoff != 0)
1588 : {
1138 pg 1589 CBC 26 : Assert(isleaf);
1590 26 : Assert(ItemPointerCompare(&orignewitem->t_tid,
1591 : &newitem->t_tid) < 0);
1592 26 : Assert(BTreeTupleIsPosting(nposting));
1593 26 : origpagepostingoff = OffsetNumberPrev(newitemoff);
1594 : }
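	/*
	 * Hypothetical worked example (added commentary, not in the original
	 * source): if orignewitem's heap TID falls inside the posting list
	 * stored at offset 12, caller performs the posting list split and passes
	 * newitemoff = 13 with postingoff != 0, so origpagepostingoff is
	 * computed here as OffsetNumberPrev(13) = 12.
	 */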
1138 pg 1595 ECB :
8297 tgl 1596 : /*
1597 : * The high key for the new left page is a possibly-truncated copy of
1598 : * firstright on the leaf level (it's "firstright itself" on internal
1599 : * pages; see !isleaf comments below). This may seem to be contrary to
1600 : * Lehman & Yao's approach of using a copy of lastleft as the new high key
1601 : * when splitting on the leaf level. It isn't, though.
1602 : *
1603 : * Suffix truncation will leave the left page's high key fully equal to
1604 : * lastleft when lastleft and firstright are equal prior to heap TID (that
1605 : * is, the tiebreaker TID value comes from lastleft). It isn't actually
1606 : * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1607 : * "subtree" invariant to hold. It's sufficient to make sure that the new
1608 : * leaf high key is strictly less than firstright, and greater than or
1609 : * equal to (not necessarily equal to) lastleft. In other words, when
1610 : * suffix truncation isn't possible during a leaf page split, we take
1611 : * L&Y's exact approach to generating a new high key for the left page.
1612 : * (Actually, that is slightly inaccurate. We don't just use a copy of
1613 : * lastleft. A tuple with all the keys from firstright but the max heap
1614 : * TID from lastleft is used, to avoid introducing a special case.)
1615 : */
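	/*
	 * Hypothetical worked example (added commentary, not in the original
	 * source): in an index on (region text, id int4), suppose lastleft is
	 * ("EU", 105) and firstright is ("US", 3).  Suffix truncation can make
	 * the new high key ("US", -inf): the id attribute is truncated away,
	 * and a truncated attribute compares as minus infinity.  That pivot is
	 * then >= lastleft and strictly < firstright, as required above.
	 */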
1091 pg 1616 GIC 23479 : if (!newitemonleft && newitemoff == firstrightoff)
1617 : {
1618 : /* incoming tuple becomes firstright */
8297 tgl 1619 CBC 17 : itemsz = newitemsz;
1091 pg 1620 GIC 17 : firstright = newitem;
1621 : }
8297 tgl 1622 ECB : else
9345 bruce 1623 : {
1624 : /* existing item at firstrightoff becomes firstright */
1091 pg 1625 GIC 23462 : itemid = PageGetItemId(origpage, firstrightoff);
8297 tgl 1626 23462 : itemsz = ItemIdGetLength(itemid);
1091 pg 1627 23462 : firstright = (IndexTuple) PageGetItem(origpage, itemid);
1091 pg 1628 CBC 23462 : if (firstrightoff == origpagepostingoff)
1091 pg 1629 LBC 0 : firstright = nposting;
9345 bruce 1630 ECB : }
1828 teodor 1631 :
1091 pg 1632 GBC 23479 : if (isleaf)
1633 : {
1634 : IndexTuple lastleft;
1481 pg 1635 ECB :
1636 : /* Attempt suffix truncation for leaf page splits */
1091 pg 1637 GIC 23378 : if (newitemonleft && newitemoff == firstrightoff)
1638 : {
1639 : /* incoming tuple becomes lastleft */
1481 pg 1640 CBC 163 : lastleft = newitem;
1641 : }
1642 : else
1481 pg 1643 ECB : {
1644 : OffsetNumber lastleftoff;
1645 :
1646 : /* existing item before firstrightoff becomes lastleft */
1091 pg 1647 GIC 23215 : lastleftoff = OffsetNumberPrev(firstrightoff);
1481 1648 23215 : Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1649 23215 : itemid = PageGetItemId(origpage, lastleftoff);
1481 pg 1650 CBC 23215 : lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1138 1651 23215 : if (lastleftoff == origpagepostingoff)
1652 3 : lastleft = nposting;
1481 pg 1653 ECB : }
1654 :
1091 pg 1655 CBC 23378 : lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1091 pg 1656 GIC 23378 : itemsz = IndexTupleSize(lefthighkey);
1657 : }
1828 teodor 1658 ECB : else
1091 pg 1659 : {
1660 : /*
1661 : * Don't perform suffix truncation on a copy of firstright to make
1662 : * left page high key for internal page splits. Must use firstright
1663 : * as new high key directly.
1664 : *
1665 : * Each distinct separator key value originates as a leaf level high
1666 : * key; all other separator keys/pivot tuples are copied from one
1667 : * level down. A separator key in a grandparent page must be
1668 : * identical to high key in rightmost parent page of the subtree to
1669 : * its left, which must itself be identical to high key in rightmost
1670 : * child page of that same subtree (this even applies to separator
1671 : * from grandparent's high key). There must always be an unbroken
1672 : * "seam" of identical separator keys that guide index scans at every
1673 : * level, starting from the grandparent. That's why suffix truncation
1674 : * is unsafe here.
1675 : *
1676 : * Internal page splits will truncate firstright into a "negative
1677 : * infinity" data item when it gets inserted on the new right page
1678 : * below, though. This happens during the call to _bt_pgaddtup() for
1679 : * the new first data item for right page. Do not confuse this
1680 : * mechanism with suffix truncation. It is just a convenient way of
1681 : * implementing page splits that split the internal page "inside"
1682 : * firstright. The lefthighkey separator key cannot appear a second
1683 : * time in the right page (only firstright's downlink goes in right
1684 : * page).
1685 : */
1091 pg 1686 GIC 101 : lefthighkey = firstright;
1687 : }
1688 :
1427 pg 1689 ECB : /*
1690 : * Add new high key to leftpage
1691 : */
1091 pg 1692 GIC 23479 : afterleftoff = P_HIKEY;
1693 :
1694 23479 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1091 pg 1695 CBC 23479 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1696 : IndexRelationGetNumberOfKeyAttributes(rel));
1697 23479 : Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1698 23479 : if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1699 : false) == InvalidOffsetNumber)
1091 pg 1700 LBC 0 : elog(ERROR, "failed to add high key to the left sibling"
5578 tgl 1701 ECB : " while splitting block %u of index \"%s\"",
1702 : origpagenumber, RelationGetRelationName(rel));
1091 pg 1703 GBC 23479 : afterleftoff = OffsetNumberNext(afterleftoff);
1704 :
1705 : /*
1427 pg 1706 ECB : * Acquire a new right page to split into, now that left page has a new
1707 : * high key. From here on, it's not okay to throw an error without
1708 : * zeroing rightpage first. This coding rule ensures that we won't
1709 : * confuse future VACUUM operations, which might otherwise try to re-find
1710 : * a downlink to a leftover junk page as the page undergoes deletion.
1711 : *
1712 : * It would be reasonable to start the critical section just after the new
1713 : * rightpage buffer is acquired instead; that would allow us to avoid
1714 : * leftover junk pages without bothering to zero rightpage. We do it this
1715 : * way because it avoids an unnecessary PANIC when either origpage or its
 1716 : * existing sibling page is corrupt.
1717 : */
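	/*
	 * (Added commentary, not in the original source:) the rule above is why
	 * every failure path below does memset(rightpage, 0, ...) before raising
	 * ERROR -- an all-zeroes page reads as uninitialized and reusable, while
	 * a half-built right sibling would not.
	 */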
8 andres 1718 GNC 23479 : rbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE);
1427 pg 1719 GIC 23479 : rightpage = BufferGetPage(rbuf);
1720 23479 : rightpagenumber = BufferGetBlockNumber(rbuf);
1427 pg 1721 ECB : /* rightpage was initialized by _bt_getbuf */
373 michael 1722 CBC 23479 : ropaque = BTPageGetOpaque(rightpage);
1427 pg 1723 ECB :
1724 : /*
1725 : * Finish off remaining leftpage special area fields. They cannot be set
1726 : * before both origpage (leftpage) and rightpage buffers are acquired and
1727 : * locked.
1728 : *
1729 : * btpo_cycleid is only used with leaf pages, though we set it here in all
1730 : * cases just to be consistent.
1731 : */
1427 pg 1732 GIC 23479 : lopaque->btpo_next = rightpagenumber;
1733 23479 : lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1734 :
1427 pg 1735 ECB : /*
1736 : * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1737 : * and HAS_GARBAGE flags.
1738 : */
1427 pg 1739 GIC 23479 : ropaque->btpo_flags = oopaque->btpo_flags;
1740 23479 : ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1741 23479 : ropaque->btpo_prev = origpagenumber;
1427 pg 1742 CBC 23479 : ropaque->btpo_next = oopaque->btpo_next;
774 1743 23479 : ropaque->btpo_level = oopaque->btpo_level;
1427 1744 23479 : ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1427 pg 1745 ECB :
1746 : /*
1747 : * Add new high key to rightpage where necessary.
1748 : *
1749 : * If the page we're splitting is not the rightmost page at its level in
1750 : * the tree, then the first entry on the page is the high key from
1751 : * origpage.
1752 : */
1091 pg 1753 GIC 23479 : afterrightoff = P_HIKEY;
1754 :
1755 23479 : if (!isrightmost)
1427 pg 1756 ECB : {
1757 : IndexTuple righthighkey;
1091 1758 :
1427 pg 1759 GIC 11299 : itemid = PageGetItemId(origpage, P_HIKEY);
1760 11299 : itemsz = ItemIdGetLength(itemid);
1091 1761 11299 : righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1091 pg 1762 CBC 11299 : Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1763 11299 : Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1091 pg 1764 ECB : IndexRelationGetNumberOfKeyAttributes(rel));
1091 pg 1765 CBC 11299 : if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1427 pg 1766 ECB : false, false) == InvalidOffsetNumber)
1767 : {
1427 pg 1768 LBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1091 pg 1769 UIC 0 : elog(ERROR, "failed to add high key to the right sibling"
1770 : " while splitting block %u of index \"%s\"",
1427 pg 1771 EUB : origpagenumber, RelationGetRelationName(rel));
1772 : }
1091 pg 1773 GIC 11299 : afterrightoff = OffsetNumberNext(afterrightoff);
1774 : }
1775 :
1091 pg 1776 ECB : /*
1777 : * Internal page splits truncate first data item on right page -- it
1778 : * becomes "minus infinity" item for the page. Set this up here.
1779 : */
1091 pg 1780 GIC 23479 : minusinfoff = InvalidOffsetNumber;
1781 23479 : if (!isleaf)
1782 101 : minusinfoff = afterrightoff;
1091 pg 1783 ECB :
1427 1784 : /*
1785 : * Now transfer all the data items (non-pivot tuples in isleaf case, or
1786 : * additional pivot tuples in !isleaf case) to the appropriate page.
1787 : *
1788 : * Note: we *must* insert at least the right page's items in item-number
1789 : * order, for the benefit of _bt_restore_page().
1790 : */
8297 tgl 1791 GIC 6963551 : for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1792 : {
1793 : IndexTuple dataitem;
1091 pg 1794 ECB :
9345 bruce 1795 GIC 6940072 : itemid = PageGetItemId(origpage, i);
1796 6940072 : itemsz = ItemIdGetLength(itemid);
1091 pg 1797 6940072 : dataitem = (IndexTuple) PageGetItem(origpage, itemid);
9345 bruce 1798 ECB :
1138 pg 1799 : /* replace original item with nposting due to posting split? */
1138 pg 1800 CBC 6940072 : if (i == origpagepostingoff)
1801 : {
1091 pg 1802 GIC 26 : Assert(BTreeTupleIsPosting(dataitem));
1138 pg 1803 CBC 26 : Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1091 pg 1804 GIC 26 : dataitem = nposting;
1138 pg 1805 ECB : }
1806 :
8297 tgl 1807 : /* does new item belong before this one? */
1138 pg 1808 GIC 6940046 : else if (i == newitemoff)
1809 : {
8297 tgl 1810 13783 : if (newitemonleft)
8297 tgl 1811 ECB : {
1091 pg 1812 GIC 4376 : Assert(newitemoff <= firstrightoff);
1091 pg 1813 CBC 4376 : if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1814 : false))
4606 tgl 1815 ECB : {
4606 tgl 1816 LBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
4606 tgl 1817 UIC 0 : elog(ERROR, "failed to add new item to the left sibling"
1818 : " while splitting block %u of index \"%s\"",
4606 tgl 1819 EUB : origpagenumber, RelationGetRelationName(rel));
1820 : }
1091 pg 1821 GIC 4376 : afterleftoff = OffsetNumberNext(afterleftoff);
1822 : }
1823 : else
8297 tgl 1824 ECB : {
1091 pg 1825 GIC 9407 : Assert(newitemoff >= firstrightoff);
1826 9407 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1827 : afterrightoff == minusinfoff))
4606 tgl 1828 ECB : {
4606 tgl 1829 LBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
4606 tgl 1830 UIC 0 : elog(ERROR, "failed to add new item to the right sibling"
1831 : " while splitting block %u of index \"%s\"",
4606 tgl 1832 EUB : origpagenumber, RelationGetRelationName(rel));
1833 : }
1091 pg 1834 GIC 9407 : afterrightoff = OffsetNumberNext(afterrightoff);
1835 : }
1836 : }
8297 tgl 1837 ECB :
1838 : /* decide which page to put it on */
1091 pg 1839 GIC 6940072 : if (i < firstrightoff)
1840 : {
1841 5140479 : if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
4606 tgl 1842 ECB : {
4606 tgl 1843 UIC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
4606 tgl 1844 LBC 0 : elog(ERROR, "failed to add old item to the left sibling"
1845 : " while splitting block %u of index \"%s\"",
4606 tgl 1846 EUB : origpagenumber, RelationGetRelationName(rel));
1847 : }
1091 pg 1848 GIC 5140479 : afterleftoff = OffsetNumberNext(afterleftoff);
1849 : }
1850 : else
9345 bruce 1851 ECB : {
1091 pg 1852 GIC 1799593 : if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1853 : afterrightoff == minusinfoff))
1854 : {
4606 tgl 1855 LBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
4606 tgl 1856 UIC 0 : elog(ERROR, "failed to add old item to the right sibling"
1857 : " while splitting block %u of index \"%s\"",
4606 tgl 1858 EUB : origpagenumber, RelationGetRelationName(rel));
1859 : }
1091 pg 1860 GIC 1799593 : afterrightoff = OffsetNumberNext(afterrightoff);
1861 : }
1862 : }
9345 bruce 1863 ECB :
1864 : /* Handle case where newitem goes at the end of rightpage */
8297 tgl 1865 GIC 23479 : if (i <= newitemoff)
1866 : {
1867 : /*
5906 tgl 1868 ECB : * Can't have newitemonleft here; that would imply we were told to put
1869 : * *everything* on the left page, which cannot fit (if it could, we'd
1870 : * not be splitting the page).
1871 : */
1091 pg 1872 GIC 9696 : Assert(!newitemonleft && newitemoff == maxoff + 1);
1873 9696 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1874 : afterrightoff == minusinfoff))
4606 tgl 1875 ECB : {
4606 tgl 1876 LBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
4606 tgl 1877 UIC 0 : elog(ERROR, "failed to add new item to the right sibling"
1878 : " while splitting block %u of index \"%s\"",
4606 tgl 1879 EUB : origpagenumber, RelationGetRelationName(rel));
1880 : }
1091 pg 1881 GIC 9696 : afterrightoff = OffsetNumberNext(afterrightoff);
1882 : }
1883 :
8222 vadim4o 1884 ECB : /*
1885 : * We have to grab the original right sibling (if any) and update its prev
1886 : * link. We are guaranteed that this is deadlock-free, since we couple
1887 : * the locks in the standard order: left to right.
1888 : */
1091 pg 1889 GIC 23479 : if (!isrightmost)
1890 : {
8 andres 1891 GNC 11299 : sbuf = _bt_getbuf(rel, heaprel, oopaque->btpo_next, BT_WRITE);
2545 kgrittn 1892 CBC 11299 : spage = BufferGetPage(sbuf);
373 michael 1893 GIC 11299 : sopaque = BTPageGetOpaque(spage);
4606 tgl 1894 CBC 11299 : if (sopaque->btpo_prev != origpagenumber)
4606 tgl 1895 ECB : {
4606 tgl 1896 LBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1347 peter 1897 0 : ereport(ERROR,
1898 : (errcode(ERRCODE_INDEX_CORRUPTED),
1347 peter 1899 EUB : errmsg_internal("right sibling's left-link doesn't match: "
1900 : "block %u links to %u instead of expected %u in index \"%s\"",
1901 : oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1902 : RelationGetRelationName(rel))));
1903 : }
1904 :
1905 : /*
1906 : * Check to see if we can set the SPLIT_END flag in the right-hand
1907 : * split page; this can save some I/O for vacuum since it need not
1908 : * proceed to the right sibling. We can set the flag if the right
1909 : * sibling has a different cycleid: that means it could not be part of
1910 : * a group of pages that were all split off from the same ancestor
1911 : * page. If you're confused, imagine that page A splits to A B and
1912 : * then again, yielding A C B, while vacuum is in progress. Tuples
1913 : * originally in A could now be in either B or C, hence vacuum must
1914 : * examine both pages. But if D, our right sibling, has a different
1915 : * cycleid then it could not contain any tuples that were in A when
1916 : * the vacuum started.
1917 : */
6180 tgl 1918 GIC 11299 : if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
6180 tgl 1919 UIC 0 : ropaque->btpo_flags |= BTP_SPLIT_END;
1920 : }
8222 vadim4o 1921 ECB :
8222 vadim4o 1922 EUB : /*
1923 : * Right sibling is locked, new siblings are prepared, but original page
1924 : * is not updated yet.
1925 : *
1926 : * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1927 : * not starting the critical section till here because we haven't been
1928 : * scribbling on the original page yet; see comments above.
1929 : */
8122 tgl 1930 GIC 23479 : START_CRIT_SECTION();
1931 :
1932 : /*
5904 bruce 1933 ECB : * By here, the original data page has been split into two new halves, and
1934 : * these are correct. The algorithm requires that the left page never
1935 : * move during a split, so we copy the new left page back on top of the
1936 : * original. We need to do this before writing the WAL record, so that
1937 : * XLogInsert can WAL log an image of the page if necessary.
1938 : */
5904 bruce 1939 GIC 23479 : PageRestoreTempPage(leftpage, origpage);
1940 : /* leftpage, lopaque must not be used below here */
1941 :
5842 tgl 1942 CBC 23479 : MarkBufferDirty(buf);
5842 tgl 1943 GIC 23479 : MarkBufferDirty(rbuf);
1944 :
1091 pg 1945 CBC 23479 : if (!isrightmost)
5842 tgl 1946 ECB : {
4606 tgl 1947 GIC 11299 : sopaque->btpo_prev = rightpagenumber;
5842 tgl 1948 CBC 11299 : MarkBufferDirty(sbuf);
1949 : }
5842 tgl 1950 ECB :
3309 heikki.linnakangas 1951 : /*
1952 : * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1953 : * a split
1954 : */
3309 heikki.linnakangas 1955 GIC 23479 : if (!isleaf)
1956 : {
2545 kgrittn 1957 101 : Page cpage = BufferGetPage(cbuf);
373 michael 1958 CBC 101 : BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1959 :
3309 heikki.linnakangas 1960 101 : cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1961 101 : MarkBufferDirty(cbuf);
1962 : }
3309 heikki.linnakangas 1963 ECB :
7551 tgl 1964 : /* XLOG stuff */
4500 rhaas 1965 GIC 23479 : if (RelationNeedsWAL(rel))
1966 : {
1967 : xl_btree_split xlrec;
7352 tgl 1968 ECB : uint8 xlinfo;
1969 : XLogRecPtr recptr;
1970 :
774 pg 1971 GIC 22648 : xlrec.level = ropaque->btpo_level;
1972 : /* See comments below on newitem, orignewitem, and posting lists */
1091 1973 22648 : xlrec.firstrightoff = firstrightoff;
3062 heikki.linnakangas 1974 CBC 22648 : xlrec.newitemoff = newitemoff;
1138 pg 1975 GIC 22648 : xlrec.postingoff = 0;
1091 pg 1976 CBC 22648 : if (postingoff != 0 && origpagepostingoff < firstrightoff)
1138 1977 17 : xlrec.postingoff = postingoff;
8137 vadim4o 1978 ECB :
3062 heikki.linnakangas 1979 CBC 22648 : XLogBeginInsert();
1980 22648 : XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1981 :
1982 22648 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1983 22648 : XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1984 : /* Log original right sibling, since we've changed its prev-pointer */
1091 pg 1985 22648 : if (!isrightmost)
3062 heikki.linnakangas 1986 11293 : XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1091 pg 1987 GIC 22648 : if (!isleaf)
3062 heikki.linnakangas 1988 CBC 101 : XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
5904 bruce 1989 ECB :
5842 tgl 1990 : /*
3062 heikki.linnakangas 1991 : * Log the new item, if it was inserted on the left page. (If it was
1992 : * put on the right page, we don't need to explicitly WAL log it
1993 : * because it's included with all the other items on the right page.)
1994 : * Show the new item as belonging to the left page buffer, so that it
1995 : * is not stored if XLogInsert decides it needs a full-page image of
1996 : * the left page. We always store newitemoff in the record, though.
1997 : *
1998 : * The details are sometimes slightly different for page splits that
1999 : * coincide with a posting list split. If both the replacement
2000 : * posting list and newitem go on the right page, then we don't need
2001 : * to log anything extra, just like the simple !newitemonleft
2002 : * no-posting-split case (postingoff is set to zero in the WAL record,
2003 : * so recovery doesn't need to process a posting list split at all).
2004 : * Otherwise, we set postingoff and log orignewitem instead of
2005 : * newitem, despite having actually inserted newitem. REDO routine
2006 : * must reconstruct nposting and newitem using _bt_swap_posting().
2007 : *
2008 : * Note: It's possible that our page split point is the point that
2009 : * makes the posting list lastleft and newitem firstright. This is
2010 : * the only case where we log orignewitem/newitem despite newitem
2011 : * going on the right page. If XLogInsert decides that it can omit
2012 : * orignewitem due to logging a full-page image of the left page,
2013 : * everything still works out, since recovery only needs to log
2014 : * orignewitem for items on the left page (just like the regular
2015 : * newitem-logged case).
2016 : */
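		/*
		 * Summary (added commentary, not in the original source) of what is
		 * logged for the new item itself in the cases below:
		 *
		 *   newitemonleft, postingoff not logged  -> newitem
		 *   postingoff logged (posting split)     -> orignewitem
		 *   otherwise                             -> nothing extra; the item
		 *       travels with the right page's full contents
		 */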
1138 pg 2017 GIC 22648 : if (newitemonleft && xlrec.postingoff == 0)
1091 2018 4359 : XLogRegisterBufData(0, (char *) newitem, newitemsz);
1138 2019 18289 : else if (xlrec.postingoff != 0)
1138 pg 2020 ECB : {
1091 pg 2021 CBC 17 : Assert(isleaf);
2022 17 : Assert(newitemonleft || firstrightoff == newitemoff);
1091 pg 2023 GIC 17 : Assert(newitemsz == IndexTupleSize(orignewitem));
1091 pg 2024 CBC 17 : XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
1138 pg 2025 ECB : }
3309 heikki.linnakangas 2026 :
1481 pg 2027 : /* Log the left page's new high key */
1091 pg 2028 GIC 22648 : if (!isleaf)
2029 : {
2030 : /* lefthighkey isn't local copy, get current pointer */
1091 pg 2031 CBC 101 : itemid = PageGetItemId(origpage, P_HIKEY);
1091 pg 2032 GIC 101 : lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2033 : }
1091 pg 2034 CBC 22648 : XLogRegisterBufData(0, (char *) lefthighkey,
2035 22648 : MAXALIGN(IndexTupleSize(lefthighkey)));
2036 :
5842 tgl 2037 ECB : /*
2038 : * Log the contents of the right page in the format understood by
2039 : * _bt_restore_page(). The whole right page will be recreated.
2040 : *
 2041 : * Direct access to the page is not ideal, but it's faster -- we should
 2042 : * implement a new function in the page API. Note we only store the tuples
2043 : * themselves, knowing that they were inserted in item-number order
2044 : * and so the line pointers can be reconstructed. See comments for
2045 : * _bt_restore_page().
2046 : */
3062 heikki.linnakangas 2047 GIC 22648 : XLogRegisterBufData(1,
2118 tgl 2048 22648 : (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
3062 heikki.linnakangas 2049 22648 : ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
5904 bruce 2050 ECB :
1481 pg 2051 CBC 22648 : xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
3062 heikki.linnakangas 2052 22648 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2053 :
5904 alvherre 2054 22648 : PageSetLSN(origpage, recptr);
8222 vadim4o 2055 22648 : PageSetLSN(rightpage, recptr);
1091 pg 2056 GIC 22648 : if (!isrightmost)
8222 vadim4o 2057 CBC 11293 : PageSetLSN(spage, recptr);
3295 heikki.linnakangas 2058 22648 : if (!isleaf)
2545 kgrittn 2059 101 : PageSetLSN(BufferGetPage(cbuf), recptr);
8222 vadim4o 2060 ECB : }
2061 :
8111 tgl 2062 CBC 23479 : END_CRIT_SECTION();
2063 :
2064 : /* release the old right sibling */
1091 pg 2065 23479 : if (!isrightmost)
6218 tgl 2066 GIC 11299 : _bt_relbuf(rel, sbuf);
2067 :
3309 heikki.linnakangas 2068 ECB : /* release the child */
3309 heikki.linnakangas 2069 CBC 23479 : if (!isleaf)
3309 heikki.linnakangas 2070 GIC 101 : _bt_relbuf(rel, cbuf);
2071 :
1091 pg 2072 ECB : /* be tidy */
1091 pg 2073 CBC 23479 : if (isleaf)
1091 pg 2074 GIC 23378 : pfree(lefthighkey);
2075 :
9345 bruce 2076 ECB : /* split's done */
8986 bruce 2077 CBC 23479 : return rbuf;
2078 : }
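/*
 * Illustrative standalone sketch (added commentary, not part of nbtinsert.c):
 * the "split point between lastleft and firstright" representation that
 * _bt_split() consumes.  This naive midpoint chooser is a hypothetical
 * stand-in for _bt_findsplitloc()'s real algorithm; offsets are 0-based
 * here, unlike OffsetNumbers, and it assumes 0 <= newitemoff <= nitems.
 */
#ifdef ILLUSTRATIVE_SKETCH
#include <stdbool.h>

typedef struct toy_split
{
	int			firstrightoff;	/* original-page offset of first right item */
	bool		newitemonleft;	/* disambiguates newitemoff == firstrightoff */
} toy_split;

static toy_split
toy_findsplitloc(const int *itemsz, int nitems, int newitemoff, int newitemsz)
{
	int			total = newitemsz;
	int			leftsz = 0;
	toy_split	split = {nitems, false};

	for (int i = 0; i < nitems; i++)
		total += itemsz[i];

	/*
	 * Walk the "imaginary page" (original items with newitem spliced in at
	 * newitemoff); the first boundary where the left half reaches half of
	 * the total becomes the split point.
	 */
	for (int pos = 0; pos <= nitems; pos++)
	{
		bool		isnew = (pos == newitemoff);

		leftsz += isnew ? newitemsz : itemsz[pos < newitemoff ? pos : pos - 1];
		if (leftsz * 2 >= total)
		{
			split.newitemonleft = (newitemoff <= pos);
			split.firstrightoff = (pos < newitemoff) ? pos + 1 : pos;
			break;
		}
	}
	return split;
}
#endif							/* ILLUSTRATIVE_SKETCH */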
2079 :
7352 tgl 2080 ECB : /*
2081 : * _bt_insert_parent() -- Insert downlink into parent, completing split.
2082 : *
2083 : * On entry, buf and rbuf are the left and right split pages, which we
2084 : * still hold write locks on. Both locks will be released here. We
2085 : * release the rbuf lock once we have a write lock on the page that we
2086 : * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2087 : * The lock on buf is released at the same point as the lock on the parent
2088 : * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2089 : * atomic operation that completes the split by inserting a new downlink.
2090 : *
 2091 : * stack - stack showing how we got here. Will be NULL when splitting the
 2092 : * true root, or during a concurrent root split, where we can afford to be inefficient
2093 : * isroot - we split the true root
2094 : * isonly - we split a page alone on its level (might have been fast root)
2095 : */
2096 : static void
7352 tgl 2097 GIC 23479 : _bt_insert_parent(Relation rel,
2098 : Relation heaprel,
2099 : Buffer buf,
2100 : Buffer rbuf,
7352 tgl 2101 ECB : BTStack stack,
2102 : bool isroot,
2103 : bool isonly)
2104 : {
2105 : /*
2106 : * Here we have to do something Lehman and Yao don't talk about: deal with
2107 : * a root split and construction of a new root. If our stack is empty
2108 : * then we have just split a node on what had been the root level when we
2109 : * descended the tree. If it was still the root then we perform a
2110 : * new-root construction. If it *wasn't* the root anymore, search to find
2111 : * the next higher level that someone constructed meanwhile, and find the
2112 : * right place to insert as for the normal case.
2113 : *
2114 : * If we have to search for the parent level, we do so by re-descending
2115 : * from the root. This is not super-efficient, but it's rare enough not
2116 : * to matter.
2117 : */
873 pg 2118 GIC 23479 : if (isroot)
2119 : {
2120 : Buffer rootbuf;
2121 :
7032 neilc 2122 CBC 2618 : Assert(stack == NULL);
873 pg 2123 GIC 2618 : Assert(isonly);
2124 : /* create a new root node and update the metapage */
8 andres 2125 GNC 2618 : rootbuf = _bt_newroot(rel, heaprel, buf, rbuf);
7352 tgl 2126 ECB : /* release the split buffers */
6218 tgl 2127 CBC 2618 : _bt_relbuf(rel, rootbuf);
6218 tgl 2128 GIC 2618 : _bt_relbuf(rel, rbuf);
6218 tgl 2129 CBC 2618 : _bt_relbuf(rel, buf);
2130 : }
7352 tgl 2131 ECB : else
2132 : {
7352 tgl 2133 CBC 20861 : BlockNumber bknum = BufferGetBlockNumber(buf);
7352 tgl 2134 GIC 20861 : BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2545 kgrittn 2135 20861 : Page page = BufferGetPage(buf);
2136 : IndexTuple new_item;
7352 tgl 2137 ECB : BTStackData fakestack;
6283 2138 : IndexTuple ritem;
7352 2139 : Buffer pbuf;
2140 :
7032 neilc 2141 GIC 20861 : if (stack == NULL)
2142 : {
2143 : BTPageOpaque opaque;
2144 :
3132 heikki.linnakangas 2145 CBC 12 : elog(DEBUG2, "concurrent ROOT page split");
373 michael 2146 GIC 12 : opaque = BTPageGetOpaque(page);
2147 :
2148 : /*
 1125 pg 2149 ECB : * We should never reach here via a leaf page split when the insert
 2150 : * of newitem was eligible for the fastpath optimization. Make sure
 2151 : * of that with an assertion.
2152 : *
2153 : * This is more of a performance issue than a correctness issue.
2154 : * The fastpath won't have a descent stack. Using a phony stack
2155 : * here works, but never rely on that. The fastpath should be
2156 : * rejected within _bt_search_insert() when the rightmost leaf
2157 : * page will split, since it's faster to go through _bt_search()
2158 : * and get a stack in the usual way.
2159 : */
873 pg 2160 GIC 12 : Assert(!(P_ISLEAF(opaque) &&
2161 : BlockNumberIsValid(RelationGetTargetBlock(rel))));
2162 :
2163 : /* Find the leftmost page at the next level up */
8 andres 2164 GNC 12 : pbuf = _bt_get_endpoint(rel, heaprel, opaque->btpo_level + 1, false,
2165 : NULL);
2166 : /* Set up a phony stack entry pointing there */
7352 tgl 2167 GIC 12 : stack = &fakestack;
2168 12 : stack->bts_blkno = BufferGetBlockNumber(pbuf);
7352 tgl 2169 CBC 12 : stack->bts_offset = InvalidOffsetNumber;
7352 tgl 2170 GIC 12 : stack->bts_parent = NULL;
2171 12 : _bt_relbuf(rel, pbuf);
7352 tgl 2172 ECB : }
2173 :
1481 pg 2174 : /* get high key from left, a strict lower bound for new right page */
6283 tgl 2175 CBC 20861 : ritem = (IndexTuple) PageGetItem(page,
6283 tgl 2176 ECB : PageGetItemId(page, P_HIKEY));
2177 :
2178 : /* form an index tuple that points at the new right page */
6283 tgl 2179 GIC 20861 : new_item = CopyIndexTuple(ritem);
1210 pg 2180 CBC 20861 : BTreeTupleSetDownLink(new_item, rbknum);
2181 :
2182 : /*
2183 : * Re-find and write lock the parent of buf.
7352 tgl 2184 ECB : *
1418 2185 : * It's possible that the location of buf's downlink has changed since
2186 : * our initial _bt_search() descent. _bt_getstackbuf() will detect
2187 : * and recover from this, updating the stack, which ensures that the
2188 : * new downlink will be inserted at the correct offset. Even buf's
2189 : * parent may have changed.
2190 : */
8 andres 2191 GNC 20861 : pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2192 :
2193 : /*
2194 : * Unlock the right child. The left child will be unlocked in
2195 : * _bt_insertonpg().
1108 pg 2196 ECB : *
2197 : * Unlocking the right child must be delayed until here to ensure that
2198 : * no concurrent VACUUM operation can become confused. Page deletion
2199 : * cannot be allowed to fail to re-find a downlink for the rbuf page.
2200 : * (Actually, this is just a vestige of how things used to work. The
2201 : * page deletion code is expected to check for the INCOMPLETE_SPLIT
2202 : * flag on the left child. It won't attempt deletion of the right
2203 : * child until the split is complete. Despite all this, we opt to
2204 : * conservatively delay unlocking the right child until here.)
2205 : */
6218 tgl 2206 GIC 20861 : _bt_relbuf(rel, rbuf);
2207 :
7352 2208 20861 : if (pbuf == InvalidBuffer)
1347 peter 2209 UIC 0 : ereport(ERROR,
2210 : (errcode(ERRCODE_INDEX_CORRUPTED),
1347 peter 2211 ECB : errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2212 : RelationGetRelationName(rel), bknum, rbknum)));
7352 tgl 2213 :
1334 pg 2214 EUB : /* Recursively insert into the parent */
8 andres 2215 GNC 20861 : _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
1119 pg 2216 GIC 20861 : new_item, MAXALIGN(IndexTupleSize(new_item)),
873 2217 20861 : stack->bts_offset + 1, 0, isonly);
2218 :
2219 : /* be tidy */
7352 tgl 2220 CBC 20861 : pfree(new_item);
7352 tgl 2221 ECB : }
7352 tgl 2222 CBC 23479 : }
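/*
 * Illustrative standalone sketch (added commentary, not part of nbtinsert.c):
 * the descent stack consumed by _bt_insert_parent() is a simple parent-linked
 * list, one frame per level descended.  toy_stack mirrors BTStackData's
 * bts_blkno/bts_offset/bts_parent fields; everything else is hypothetical.
 */
#ifdef ILLUSTRATIVE_SKETCH
#include <stdio.h>

typedef struct toy_stack
{
	unsigned	blkno;			/* like bts_blkno: parent page to revisit */
	int			offset;			/* like bts_offset: remembered downlink spot */
	struct toy_stack *parent;	/* like bts_parent: NULL once at the root */
} toy_stack;

/* Walk the frames the way recursive _bt_insertonpg() calls consume them. */
static void
toy_report_cascade(const toy_stack *stack)
{
	for (int level = 1; stack != NULL; stack = stack->parent, level++)
		printf("level %d parent: block %u, downlink near offset %d\n",
			   level, stack->blkno, stack->offset);
}

int
main(void)
{
	toy_stack	root = {.blkno = 3, .offset = 1, .parent = NULL};
	toy_stack	leafparent = {.blkno = 7, .offset = 42, .parent = &root};

	toy_report_cascade(&leafparent);	/* leaf's parent first, then root */
	return 0;
}
#endif							/* ILLUSTRATIVE_SKETCH */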
2223 :
2224 : /*
3309 heikki.linnakangas 2225 ECB : * _bt_finish_split() -- Finish an incomplete split
2226 : *
2227 : * A crash or other failure can leave a split incomplete. The insertion
 2228 : * routines won't allow insertion on a page that is incompletely split.
2229 : * Before inserting on such a page, call _bt_finish_split().
2230 : *
2231 : * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2232 : * and unpinned.
2233 : */
2234 : void
8 andres 2235 UNC 0 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2236 : {
2545 kgrittn 2237 UIC 0 : Page lpage = BufferGetPage(lbuf);
373 michael 2238 0 : BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2239 : Buffer rbuf;
3309 heikki.linnakangas 2240 EUB : Page rpage;
2241 : BTPageOpaque rpageop;
873 pg 2242 : bool wasroot;
2243 : bool wasonly;
2244 :
3309 heikki.linnakangas 2245 UIC 0 : Assert(P_INCOMPLETE_SPLIT(lpageop));
2246 :
2247 : /* Lock right sibling, the one missing the downlink */
8 andres 2248 UNC 0 : rbuf = _bt_getbuf(rel, heaprel, lpageop->btpo_next, BT_WRITE);
2545 kgrittn 2249 UIC 0 : rpage = BufferGetPage(rbuf);
373 michael 2250 UBC 0 : rpageop = BTPageGetOpaque(rpage);
2251 :
2252 : /* Could this be a root split? */
3309 heikki.linnakangas 2253 0 : if (!stack)
3309 heikki.linnakangas 2254 EUB : {
2255 : Buffer metabuf;
2256 : Page metapg;
2257 : BTMetaPageData *metad;
2258 :
2259 : /* acquire lock on the metapage */
8 andres 2260 UNC 0 : metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE);
2545 kgrittn 2261 UIC 0 : metapg = BufferGetPage(metabuf);
3309 heikki.linnakangas 2262 0 : metad = BTPageGetMeta(metapg);
2263 :
873 pg 2264 0 : wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
3309 heikki.linnakangas 2265 EUB :
3309 heikki.linnakangas 2266 UBC 0 : _bt_relbuf(rel, metabuf);
3309 heikki.linnakangas 2267 EUB : }
2268 : else
873 pg 2269 UBC 0 : wasroot = false;
2270 :
3309 heikki.linnakangas 2271 EUB : /* Was this the only page on the level before split? */
873 pg 2272 UIC 0 : wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2273 :
3309 heikki.linnakangas 2274 UBC 0 : elog(DEBUG1, "finishing incomplete split of %u/%u",
2275 : BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2276 :
8 andres 2277 UNC 0 : _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
3309 heikki.linnakangas 2278 UIC 0 : }
3309 heikki.linnakangas 2279 EUB :
2280 : /*
2281 : * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
 1334 pg 2282 : * tuple whose downlink points to the child page.
8297 tgl 2283 : *
2284 : * Caller passes child's block number, which is used to identify
 2285 : * the associated pivot tuple in the parent page using a linear search
 2286 : * that matches on the pivot's downlink/block number. The expected location of
2287 : * the pivot tuple is taken from the stack one level above the child
2288 : * page. This is used as a starting point. Insertions into the
2289 : * parent level could cause the pivot tuple to move right; deletions
2290 : * could cause it to move left, but not left of the page we previously
2291 : * found it on.
2292 : *
2293 : * Caller can use its stack to relocate the pivot tuple/downlink for
2294 : * any same-level page to the right of the page found by its initial
2295 : * descent. This is necessary because of the possibility that caller
2296 : * moved right to recover from a concurrent page split. It's also
2297 : * convenient for certain callers to be able to step right when there
2298 : * wasn't a concurrent page split, while still using their original
2299 : * stack. For example, the checkingunique _bt_doinsert() case may
2300 : * have to step right when there are many physical duplicates, and its
2301 : * scantid forces an insertion to the right of the "first page the
2302 : * value could be on". (This is also relied on by all of our callers
2303 : * when dealing with !heapkeyspace indexes.)
2304 : *
2305 : * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2306 : * tuple not found (should not happen). Adjusts bts_blkno &
2307 : * bts_offset if changed. Page split caller should insert its new
2308 : * pivot tuple for its new right sibling page on parent page, at the
2309 : * offset number bts_offset + 1.
2310 : */
2311 : Buffer
8 andres 2312 GNC 23619 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
2313 : {
2314 : BlockNumber blkno;
2315 : OffsetNumber start;
2316 :
8297 tgl 2317 CBC 23619 : blkno = stack->bts_blkno;
8297 tgl 2318 GIC 23619 : start = stack->bts_offset;
2319 :
2320 : for (;;)
2321 5 : {
7352 tgl 2322 ECB : Buffer buf;
2323 : Page page;
2324 : BTPageOpaque opaque;
2325 :
8 andres 2326 GNC 23624 : buf = _bt_getbuf(rel, heaprel, blkno, BT_WRITE);
2545 kgrittn 2327 GIC 23624 : page = BufferGetPage(buf);
373 michael 2328 23624 : opaque = BTPageGetOpaque(page);
2329 :
1504 pg 2330 23624 : if (P_INCOMPLETE_SPLIT(opaque))
3309 heikki.linnakangas 2331 ECB : {
8 andres 2332 UNC 0 : _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
3309 heikki.linnakangas 2333 LBC 0 : continue;
2334 : }
3309 heikki.linnakangas 2335 ECB :
7351 tgl 2336 GIC 23624 : if (!P_IGNORE(opaque))
8297 tgl 2337 EUB : {
7351 2338 : OffsetNumber offnum,
2339 : minoff,
2340 : maxoff;
7351 tgl 2341 ECB : ItemId itemid;
2342 : IndexTuple item;
2343 :
7351 tgl 2344 GIC 23622 : minoff = P_FIRSTDATAKEY(opaque);
2345 23622 : maxoff = PageGetMaxOffsetNumber(page);
2346 :
2347 : /*
2348 : * start = InvalidOffsetNumber means "search the whole page". We
 6385 bruce 2349 ECB : * need this test anyway due to the possibility that the page has a
 2350 : * high key now when it didn't before.
2351 : */
7351 tgl 2352 GIC 23622 : if (start < minoff)
2353 17 : start = minoff;
2354 :
2355 : /*
 2356 : * Need this check too, to guard against the possibility that the page
6809 tgl 2357 ECB : * split since we visited it originally.
2358 : */
6809 tgl 2359 GIC 23622 : if (start > maxoff)
2360 3 : start = OffsetNumberNext(maxoff);
2361 :
2362 : /*
2363 : * These loops will check every item on the page --- but in an
6385 bruce 2364 ECB : * order that's attuned to the probability of where it actually
3260 2365 : * is. Scan to the right first, then to the left.
2366 : */
7351 tgl 2367 GIC 23622 : for (offnum = start;
2368 24048 : offnum <= maxoff;
2369 426 : offnum = OffsetNumberNext(offnum))
2370 : {
2371 24045 : itemid = PageGetItemId(page, offnum);
6283 tgl 2372 CBC 24045 : item = (IndexTuple) PageGetItem(page, itemid);
1828 teodor 2373 ECB :
1210 pg 2374 CBC 24045 : if (BTreeTupleGetDownLink(item) == child)
2375 : {
7351 tgl 2376 ECB : /* Return accurate pointer to where link is now */
7351 tgl 2377 CBC 23619 : stack->bts_blkno = blkno;
7351 tgl 2378 GIC 23619 : stack->bts_offset = offnum;
7351 tgl 2379 CBC 23619 : return buf;
2380 : }
2381 : }
8053 bruce 2382 ECB :
7351 tgl 2383 CBC 3 : for (offnum = OffsetNumberPrev(start);
2384 858 : offnum >= minoff;
7351 tgl 2385 GIC 855 : offnum = OffsetNumberPrev(offnum))
2386 : {
2387 855 : itemid = PageGetItemId(page, offnum);
6283 tgl 2388 CBC 855 : item = (IndexTuple) PageGetItem(page, itemid);
1828 teodor 2389 ECB :
1210 pg 2390 CBC 855 : if (BTreeTupleGetDownLink(item) == child)
2391 : {
7351 tgl 2392 ECB : /* Return accurate pointer to where link is now */
7351 tgl 2393 LBC 0 : stack->bts_blkno = blkno;
7351 tgl 2394 UIC 0 : stack->bts_offset = offnum;
7351 tgl 2395 LBC 0 : return buf;
2396 : }
2397 : }
7352 tgl 2398 EUB : }
2399 :
8053 bruce 2400 : /*
2401 : * The item we're looking for moved right at least one page.
2402 : *
2403 : * Lehman and Yao couple/chain locks when moving right here, which we
2404 : * can avoid. See nbtree/README.
2405 : */
8297 tgl 2406 GIC 5 : if (P_RIGHTMOST(opaque))
2407 : {
7938 tgl 2408 UIC 0 : _bt_relbuf(rel, buf);
6927 2409 0 : return InvalidBuffer;
2410 : }
8297 tgl 2411 CBC 5 : blkno = opaque->btpo_next;
7352 tgl 2412 GIC 5 : start = InvalidOffsetNumber;
7938 tgl 2413 GBC 5 : _bt_relbuf(rel, buf);
8297 tgl 2414 EUB : }
2415 : }
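/*
 * Illustrative standalone sketch (added commentary, not part of nbtinsert.c):
 * _bt_getstackbuf()'s search order above -- probe rightward from the
 * remembered position first, then leftward -- since insertions usually push
 * the downlink to the right.  toy_find_downlink is hypothetical.  Returns
 * the index of `target` among the page's downlinks, or -1 if it moved right.
 */
#ifdef ILLUSTRATIVE_SKETCH
static int
toy_find_downlink(const unsigned *downlinks, int nitems, int start,
				  unsigned target)
{
	if (start < 0)
		start = 0;
	if (start > nitems)
		start = nitems;

	for (int i = start; i < nitems; i++)
		if (downlinks[i] == target)
			return i;
	for (int i = start - 1; i >= 0; i--)
		if (downlinks[i] == target)
			return i;
	return -1;					/* caller would step right to the next page */
}
#endif							/* ILLUSTRATIVE_SKETCH */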
9770 scrappy 2416 ECB :
2417 : /*
9345 bruce 2418 : * _bt_newroot() -- Create a new root page for the index.
2419 : *
2420 : * We've just split the old root page and need to create a new one.
2421 : * In order to do this, we add a new root page to the file, then lock
2422 : * the metadata page and update it. This is guaranteed to be deadlock-
2423 : * free, because all readers release their locks on the metadata page
2424 : * before trying to lock the root, and all writers lock the root before
2425 : * trying to lock the metadata page. We have a write lock on the old
2426 : * root page, so we have not introduced any cycles into the waits-for
2427 : * graph.
2428 : *
2429 : * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2430 : * locked. On exit, a new root page exists with entries for the
2431 : * two new children, metapage is updated and unlocked/unpinned.
2432 : * The new root buffer is returned to caller which has to unlock/unpin
2433 : * lbuf, rbuf & rootbuf.
2434 : */
2435 : static Buffer
8 andres 2436 GNC 2618 : _bt_newroot(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
2437 : {
2438 : Buffer rootbuf;
2439 : Page lpage,
2440 : rootpage;
8053 bruce 2441 ECB : BlockNumber lbkno,
2442 : rbkno;
2443 : BlockNumber rootblknum;
2444 : BTPageOpaque rootopaque;
2445 : BTPageOpaque lopaque;
2446 : ItemId itemid;
2447 : IndexTuple item;
2448 : IndexTuple left_item;
2449 : Size left_item_sz;
2450 : IndexTuple right_item;
2451 : Size right_item_sz;
2452 : Buffer metabuf;
2453 : Page metapg;
2454 : BTMetaPageData *metad;
2455 :
7352 tgl 2456 GIC 2618 : lbkno = BufferGetBlockNumber(lbuf);
2457 2618 : rbkno = BufferGetBlockNumber(rbuf);
2545 kgrittn 2458 2618 : lpage = BufferGetPage(lbuf);
373 michael 2459 2618 : lopaque = BTPageGetOpaque(lpage);
2460 :
9345 bruce 2461 ECB : /* get a new root page */
8 andres 2462 GNC 2618 : rootbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE);
2545 kgrittn 2463 CBC 2618 : rootpage = BufferGetPage(rootbuf);
8222 vadim4o 2464 2618 : rootblknum = BufferGetBlockNumber(rootbuf);
2465 :
2466 : /* acquire lock on the metapage */
8 andres 2467 GNC 2618 : metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE);
2545 kgrittn 2468 CBC 2618 : metapg = BufferGetPage(metabuf);
8137 vadim4o 2469 2618 : metad = BTPageGetMeta(metapg);
2470 :
2471 : /*
1091 pg 2472 ECB : * Create downlink item for left page (old root). The key value used is
2473 : * "minus infinity", a sentinel value that's reliably less than any real
2474 : * key value that could appear in the left page.
2475 : */
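	/*
	 * (Added commentary, not in the original source:) setting the downlink's
	 * attribute count to zero below, via BTreeTupleSetNAtts(), is how nbtree
	 * represents "minus infinity": comparisons treat the first data item on
	 * an internal page as preceding every possible key.
	 */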
3292 heikki.linnakangas 2476 GIC 2618 : left_item_sz = sizeof(IndexTupleData);
2477 2618 : left_item = (IndexTuple) palloc(left_item_sz);
2478 2618 : left_item->t_info = left_item_sz;
1210 pg 2479 2618 : BTreeTupleSetDownLink(left_item, lbkno);
1097 2480 2618 : BTreeTupleSetNAtts(left_item, 0, false);
3292 heikki.linnakangas 2481 ECB :
2482 : /*
2483 : * Create downlink item for right page. The key for it is obtained from
2484 : * the "high key" position in the left page.
2485 : */
3292 heikki.linnakangas 2486 GIC 2618 : itemid = PageGetItemId(lpage, P_HIKEY);
2487 2618 : right_item_sz = ItemIdGetLength(itemid);
2488 2618 : item = (IndexTuple) PageGetItem(lpage, itemid);
2489 2618 : right_item = CopyIndexTuple(item);
1210 pg 2490 2618 : BTreeTupleSetDownLink(right_item, rbkno);
3292 heikki.linnakangas 2491 ECB :
7202 tgl 2492 : /* NO EREPORT(ERROR) from here till newroot op is logged */
8122 tgl 2493 CBC 2618 : START_CRIT_SECTION();
9345 bruce 2494 ECB :
1775 teodor 2495 : /* upgrade metapage if needed */
1481 pg 2496 GIC 2618 : if (metad->btm_version < BTREE_NOVAC_VERSION)
1775 teodor 2497 UIC 0 : _bt_upgrademetapage(metapg);
1775 teodor 2498 ECB :
2499 : /* set btree special data */
373 michael 2500 GIC 2618 : rootopaque = BTPageGetOpaque(rootpage);
9345 bruce 2501 CBC 2618 : rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
7352 tgl 2502 GBC 2618 : rootopaque->btpo_flags = BTP_ROOT;
774 pg 2503 GIC 2618 : rootopaque->btpo_level =
373 michael 2504 2618 : (BTPageGetOpaque(lpage))->btpo_level + 1;
6180 tgl 2505 CBC 2618 : rootopaque->btpo_cycleid = 0;
9345 bruce 2506 ECB :
7352 tgl 2507 : /* update metapage data */
7352 tgl 2508 CBC 2618 : metad->btm_root = rootblknum;
774 pg 2509 2618 : metad->btm_level = rootopaque->btpo_level;
7352 tgl 2510 2618 : metad->btm_fastroot = rootblknum;
774 pg 2511 GIC 2618 : metad->btm_fastlevel = rootopaque->btpo_level;
2512 :
9345 bruce 2513 ECB : /*
6385 2514 : * Insert the left page pointer into the new root page. The root page is
2515 : * the rightmost page on its level so there is no "high key" in it; the
2516 : * two items will go into positions P_HIKEY and P_FIRSTKEY.
2517 : *
2518 : * Note: we *must* insert the two items in item-number order, for the
2519 : * benefit of _bt_restore_page().
2520 : */
1816 teodor 2521 GIC 2618 : Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
3292 heikki.linnakangas 2522 2618 : if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2523 : false, false) == InvalidOffsetNumber)
5578 tgl 2524 UIC 0 : elog(PANIC, "failed to add leftkey to new root page"
2525 : " while splitting block %u of index \"%s\"",
5578 tgl 2526 ECB : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
9345 bruce 2527 :
2528 : /*
9345 bruce 2529 EUB : * Insert the right page pointer into the new root page.
2530 : */
1481 pg 2531 GIC 2618 : Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2532 2618 : Assert(BTreeTupleGetNAtts(right_item, rel) <=
2533 : IndexRelationGetNumberOfKeyAttributes(rel));
3292 heikki.linnakangas 2534 2618 : if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2535 : false, false) == InvalidOffsetNumber)
5578 tgl 2536 LBC 0 : elog(PANIC, "failed to add rightkey to new root page"
5578 tgl 2537 ECB : " while splitting block %u of index \"%s\"",
2538 : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
9345 bruce 2539 :
2540 : /* Clear the incomplete-split flag in the left child */
3309 heikki.linnakangas 2541 GBC 2618 : Assert(P_INCOMPLETE_SPLIT(lopaque));
3309 heikki.linnakangas 2542 GIC 2618 : lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2543 2618 : MarkBufferDirty(lbuf);
2544 :
6218 tgl 2545 2618 : MarkBufferDirty(rootbuf);
6218 tgl 2546 CBC 2618 : MarkBufferDirty(metabuf);
6218 tgl 2547 ECB :
8222 vadim4o 2548 : /* XLOG stuff */
4500 rhaas 2549 GIC 2618 : if (RelationNeedsWAL(rel))
8222 vadim4o 2550 ECB : {
8053 bruce 2551 : xl_btree_newroot xlrec;
2552 : XLogRecPtr recptr;
2553 : xl_btree_metadata md;
8213 vadim4o 2554 :
7352 tgl 2555 GIC 2605 : xlrec.rootblk = rootblknum;
8137 vadim4o 2556 2605 : xlrec.level = metad->btm_level;
2557 :
3062 heikki.linnakangas 2558 2605 : XLogBeginInsert();
2559 2605 : XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
3062 heikki.linnakangas 2560 ECB :
3062 heikki.linnakangas 2561 CBC 2605 : XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
3062 heikki.linnakangas 2562 GIC 2605 : XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
1983 tgl 2563 CBC 2605 : XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
3062 heikki.linnakangas 2564 ECB :
1481 pg 2565 GIC 2605 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1481 pg 2566 CBC 2605 : md.version = metad->btm_version;
3062 heikki.linnakangas 2567 2605 : md.root = rootblknum;
2568 2605 : md.level = metad->btm_level;
3062 heikki.linnakangas 2569 GIC 2605 : md.fastroot = rootblknum;
3062 heikki.linnakangas 2570 CBC 2605 : md.fastlevel = metad->btm_level;
774 pg 2571 2605 : md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1138 2572 2605 : md.allequalimage = metad->btm_allequalimage;
3062 heikki.linnakangas 2573 ECB :
3062 heikki.linnakangas 2574 CBC 2605 : XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
8222 vadim4o 2575 ECB :
8053 bruce 2576 : /*
6385 2577 : * Direct access to the page is not good style, but it is faster; the
2578 : * page API should eventually provide a function for this.
8222 vadim4o 2579 : */
3062 heikki.linnakangas 2580 GIC 2605 : XLogRegisterBufData(0,
2118 tgl 2581 2605 : (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
3062 heikki.linnakangas 2582 2605 : ((PageHeader) rootpage)->pd_special -
2583 2605 : ((PageHeader) rootpage)->pd_upper);
2584 :
3062 heikki.linnakangas 2585 CBC 2605 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
8213 vadim4o 2586 ECB :
3274 heikki.linnakangas 2587 CBC 2605 : PageSetLSN(lpage, recptr);
8222 vadim4o 2588 2605 : PageSetLSN(rootpage, recptr);
8213 vadim4o 2589 GIC 2605 : PageSetLSN(metapg, recptr);
8222 vadim4o 2590 ECB : }
2591 :
8122 tgl 2592 CBC 2618 : END_CRIT_SECTION();
8222 vadim4o 2593 ECB :
6218 tgl 2594 : /* done with metapage */
6218 tgl 2595 GIC 2618 : _bt_relbuf(rel, metabuf);
2596 :
3292 heikki.linnakangas 2597 CBC 2618 : pfree(left_item);
3292 heikki.linnakangas 2598 GIC 2618 : pfree(right_item);
2599 :
6297 neilc 2600 CBC 2618 : return rootbuf;
2601 : }
8108 vadim4o 2602 ECB :
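/*
 * Editor's sketch (not part of nbtinsert.c): a hypothetical checker for
 * the root-page layout that _bt_newroot() builds above.  The function
 * name is invented for illustration; it relies only on accessors that
 * nbtree.h already provides, and assumes the caller still holds a lock
 * on rootbuf.
 */
static void
sketch_check_new_root_layout(Relation rel, Page rootpage,
							 BlockNumber lbkno, BlockNumber rbkno)
{
	IndexTuple	ltup,
				rtup;

	/* Item at P_HIKEY is the "minus infinity" downlink to the left page */
	ltup = (IndexTuple) PageGetItem(rootpage,
									PageGetItemId(rootpage, P_HIKEY));
	Assert(BTreeTupleGetNAtts(ltup, rel) == 0); /* truncated key */
	Assert(BTreeTupleGetDownLink(ltup) == lbkno);

	/* Item at P_FIRSTKEY carries the left page's old high key */
	rtup = (IndexTuple) PageGetItem(rootpage,
									PageGetItemId(rootpage, P_FIRSTKEY));
	Assert(BTreeTupleGetDownLink(rtup) == rbkno);
}
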
9770 scrappy 2603 : /*
2604 : * _bt_pgaddtup() -- add a data item to a particular page during split.
2605 : *
2606 : * The difference between this routine and a bare PageAddItem call is
2607 : * that this code can deal with the first data item on an internal btree
2608 : * page in passing. This data item (which is called "firstright" within
2609 : * _bt_split()) has a key that must be treated as minus infinity after
2610 : * the split. Therefore, we truncate away all attributes when the caller
2611 : * specifies it's the first data item on the page (the downlink is not changed,
2612 : * though). This extra step is only needed for the right page of an
2613 : * internal page split. There is no need to do this for the first data
2614 : * item on the existing/left page, since that will already have been
2615 : * truncated during an earlier page split.
2616 : *
2617 : * See _bt_split() for a high level explanation of why we truncate here.
2618 : * Note that this routine has nothing to do with suffix truncation,
2619 : * despite using some of the same infrastructure.
2620 : */
2621 : static inline bool
4606 tgl 2622 GIC 6963551 : _bt_pgaddtup(Page page,
2623 : Size itemsize,
2624 : IndexTuple itup,
2625 : OffsetNumber itup_off,
2626 : bool newfirstdataitem)
9770 scrappy 2627 ECB : {
2628 : IndexTupleData trunctuple;
2629 :
1091 pg 2630 GIC 6963551 : if (newfirstdataitem)
2631 : {
6283 tgl 2632 101 : trunctuple = *itup;
2633 101 : trunctuple.t_info = sizeof(IndexTupleData);
1097 pg 2634 101 : BTreeTupleSetNAtts(&trunctuple, 0, false);
6283 tgl 2635 CBC 101 : itup = &trunctuple;
6283 tgl 2636 GIC 101 : itemsize = sizeof(IndexTupleData);
8451 tgl 2637 ECB : }
2638 :
1091 pg 2639 CBC 6963551 : if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
1091 pg 2640 ECB : false) == InvalidOffsetNumber))
4606 tgl 2641 LBC 0 : return false;
2642 :
4606 tgl 2643 GIC 6963551 : return true;
9770 scrappy 2644 ECB : }
2645 :
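/*
 * Editor's sketch (not part of nbtinsert.c): the shape of a
 * _bt_pgaddtup() call as a split routine might make it when placing a
 * tuple onto the new right half of a page.  Variable names here are
 * hypothetical; the point is the final argument, which is true exactly
 * when the tuple will become the first data item of an internal page,
 * triggering the truncation described above.
 */
if (!_bt_pgaddtup(rightpage, itemsz, itup, rightoff,
				  !isleaf && rightoff == P_FIRSTDATAKEY(ropaque)))
	elog(ERROR, "failed to add tuple to right sibling during split");
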
6102 tgl 2646 EUB : /*
2647 : * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
873 pg 2648 ECB : *
2649 : * There are three operations performed here: simple index deletion, bottom-up
2650 : * index deletion, and deduplication. If all three operations fail to free
2651 : * enough space for the incoming item, then the caller will go on to split the
2652 : * page. We always consider simple deletion first. If that doesn't work out
2653 : * we consider alternatives. Callers that only want us to consider simple
2654 : * deletion (without any fallback) ask for that using the 'simpleonly'
2655 : * argument.
2656 : *
2657 : * We usually pick only one alternative "complex" operation when simple
2658 : * deletion alone won't prevent a page split. The 'checkingunique',
2659 : * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2660 : *
2661 : * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2662 : * level flag was found set. The flag was useful back when there wasn't
2663 : * necessarily one single page for a duplicate tuple to go on (before heap TID
2664 : * became a part of the key space in version 4 indexes). But we don't
2665 : * actually look at the flag anymore (it's not a gating condition for our
2666 : * caller). Treating it as one would cause us to miss tuples that are safe
2667 : * to delete, without getting any benefit in return. We know that the alternative is to
2668 : * split the page; scanning the line pointer array in passing won't have
2669 : * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2670 : * all this because !heapkeyspace indexes must still do a "getting tired"
2671 : * linear search, and so are likely to get some benefit from using it as a
2672 : * gating condition.)
2673 : */
2674 : static void
873 pg 2675 GIC 50681 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2676 : BTInsertState insertstate,
2677 : bool simpleonly, bool checkingunique,
2678 : bool uniquedup, bool indexUnchanged)
2679 : {
1138 pg 2680 ECB : OffsetNumber deletable[MaxIndexTuplesPerPage];
6031 bruce 2681 GIC 50681 : int ndeletable = 0;
2682 : OffsetNumber offnum,
2683 : minoff,
2684 : maxoff;
873 pg 2685 50681 : Buffer buffer = insertstate->buf;
873 pg 2686 CBC 50681 : BTScanInsert itup_key = insertstate->itup_key;
2545 kgrittn 2687 GIC 50681 : Page page = BufferGetPage(buffer);
373 michael 2688 50681 : BTPageOpaque opaque = BTPageGetOpaque(page);
2689 :
1481 pg 2690 CBC 50681 : Assert(P_ISLEAF(opaque));
816 2691 50681 : Assert(simpleonly || itup_key->heapkeyspace);
2692 50681 : Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
1481 pg 2693 ECB :
2694 : /*
5624 bruce 2695 : * Scan over all items to see which ones need to be deleted according to
816 pg 2696 : * LP_DEAD flags. We'll usually manage to delete a few extra items that
2697 : * are not marked LP_DEAD in passing. Often the extra items that actually
2698 : * end up getting deleted are items that would have had their LP_DEAD bit
2699 : * set before long anyway (if we opted not to include them as extras).
2700 : */
816 pg 2701 GIC 50681 : minoff = P_FIRSTDATAKEY(opaque);
6102 tgl 2702 50681 : maxoff = PageGetMaxOffsetNumber(page);
816 pg 2703 50681 : for (offnum = minoff;
6102 tgl 2704 13208843 : offnum <= maxoff;
2705 13158162 : offnum = OffsetNumberNext(offnum))
6102 tgl 2706 ECB : {
6031 bruce 2707 CBC 13158162 : ItemId itemId = PageGetItemId(page, offnum);
6102 tgl 2708 ECB :
5688 tgl 2709 CBC 13158162 : if (ItemIdIsDead(itemId))
6102 2710 120725 : deletable[ndeletable++] = offnum;
2711 : }
6102 tgl 2712 ECB :
6102 tgl 2713 GIC 50681 : if (ndeletable > 0)
873 pg 2714 ECB : {
816 pg 2715 CBC 4769 : _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2716 : insertstate->itup, minoff, maxoff);
873 pg 2717 GIC 4769 : insertstate->bounds_valid = false;
873 pg 2718 ECB :
2719 : /* Return when a page split has already been avoided */
873 pg 2720 CBC 4769 : if (PageGetFreeSpace(page) >= insertstate->itemsz)
873 pg 2721 GIC 22013 : return;
873 pg 2722 ECB :
2723 : /* Might as well assume duplicates (if checkingunique) */
873 pg 2724 GIC 304 : uniquedup = true;
873 pg 2725 ECB : }
2726 :
2727 : /*
2728 : * We're done with simple deletion. Return early for callers that only
816 2729 : * wanted simple deletion to be considered. This includes callers that
2730 : * explicitly ask for this, and checkingunique callers that probably
2731 : * don't have any version churn duplicates on the page.
2732 : *
2733 : * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2734 : * return at this point (or when we go on to try either or both of our
2735 : * other strategies and they also fail). We do not bother expending a
2736 : * separate write to clear it, however. Caller will definitely clear it
2737 : * when it goes on to split the page (note also that the deduplication
2738 : * process will clear the flag in passing, just to keep things tidy).
2739 : */
816 pg 2740 GIC 46216 : if (simpleonly || (checkingunique && !uniquedup))
2741 : {
2742 17413 : Assert(!indexUnchanged);
873 2743 17413 : return;
2744 : }
873 pg 2745 ECB :
2746 : /* Assume bounds about to be invalidated (this is almost certain now) */
873 pg 2747 CBC 28803 : insertstate->bounds_valid = false;
6031 bruce 2748 ECB :
2749 : /*
2750 : * Perform a bottom-up index deletion pass when the executor hint indicated
2751 : * that the incoming item is logically unchanged, or for a unique index that is
816 pg 2752 : * known to have physical duplicates for some other reason. (There is a
2753 : * large overlap between these two cases for a unique index. It's worth
2754 : * having both triggering conditions in order to apply the optimization in
2755 : * the event of successive related INSERT and DELETE statements.)
2756 : *
2757 : * We'll go on to do a deduplication pass when a bottom-up pass fails to
2758 : * delete an acceptable amount of free space (a significant fraction of
2759 : * the page, or space for the new item, whichever is greater).
2760 : *
2761 : * Note: Bottom-up index deletion uses the same equality/equivalence
2762 : * routines as deduplication internally. However, it does not merge
2763 : * together index tuples, so the same correctness considerations do not
2764 : * apply. We deliberately omit an index-is-allequalimage test here.
2765 : */
816 pg 2766 GIC 32315 : if ((indexUnchanged || uniquedup) &&
2767 3512 : _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2768 135 : return;
2769 :
2770 : /* Perform deduplication pass (when enabled and index-is-allequalimage) */
873 pg 2771 CBC 28668 : if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2772 28659 : _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
565 2773 28659 : insertstate->itemsz, (indexUnchanged || uniquedup));
2774 : }
2775 :
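/*
 * Editor's sketch (not part of nbtinsert.c): the three-step cascade
 * above, condensed to its decision structure.  The calls and gating
 * conditions are copied from _bt_delete_or_dedup_one_page(); the local
 * variable setup is omitted.
 */
/* Step 1: simple deletion of LP_DEAD items is always considered first */
if (ndeletable > 0)
{
	_bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
					   insertstate->itup, minoff, maxoff);
	if (PageGetFreeSpace(page) >= insertstate->itemsz)
		return;					/* page split avoided */
	uniquedup = true;
}
if (simpleonly || (checkingunique && !uniquedup))
	return;						/* caller wanted simple deletion only */

/* Step 2: bottom-up deletion, gated on executor hint or duplicates */
if ((indexUnchanged || uniquedup) &&
	_bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
	return;						/* page split avoided */

/* Step 3: deduplication, gated on reloption and allequalimage status */
if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
	_bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
				   insertstate->itemsz, (indexUnchanged || uniquedup));
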
816 pg 2776 ECB : /*
2777 : * _bt_simpledel_pass - Simple index tuple deletion pass.
2778 : *
2779 : * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2780 : * of all such tuples are determined by caller (caller passes these to us as
2781 : * its 'deletable' argument).
2782 : *
2783 : * We might also delete extra index tuples in passing when they turn out to
2784 : * be safe to delete (though only when checking them is cheap to begin with).
2785 : * There is no certainty that any extra tuples will be deleted, though. The
2786 : * high level goal of the approach we take is to get the most out of each call
2787 : * here (without noticeably increasing the per-call overhead compared to what
2788 : * we need to do just to be able to delete the page's LP_DEAD-marked index
2789 : * tuples).
2790 : *
2791 : * The number of extra index tuples that turn out to be deletable might
2792 : * greatly exceed the number of LP_DEAD-marked index tuples due to various
2793 : * locality related effects. For example, it's possible that the total number
2794 : * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2795 : * quite low, in which case we might end up checking if it's possible to
2796 : * delete _most_ index tuples on the page (without the tableam needing to
2797 : * access additional table blocks). The tableam will sometimes stumble upon
2798 : * _many_ extra deletable index tuples in indexes where this pattern is
2799 : * common.
2800 : *
2801 : * See nbtree/README for further details on simple index tuple deletion.
2802 : */
2803 : static void
816 pg 2804 GIC 4769 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2805 : OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2806 : OffsetNumber minoff, OffsetNumber maxoff)
2807 : {
2808 4769 : Page page = BufferGetPage(buffer);
816 pg 2809 ECB : BlockNumber *deadblocks;
2810 : int ndeadblocks;
2811 : TM_IndexDeleteOp delstate;
2812 : OffsetNumber offnum;
2813 :
2814 : /* Get array of table blocks pointed to by LP_DEAD-set tuples */
816 pg 2815 GIC 4769 : deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2816 : &ndeadblocks);
2817 :
2818 : /* Initialize tableam state that describes index deletion operation */
521 2819 4769 : delstate.irel = rel;
521 pg 2820 CBC 4769 : delstate.iblknum = BufferGetBlockNumber(buffer);
816 pg 2821 GIC 4769 : delstate.bottomup = false;
2822 4769 : delstate.bottomupfreespace = 0;
2823 4769 : delstate.ndeltids = 0;
816 pg 2824 CBC 4769 : delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2825 4769 : delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
816 pg 2826 ECB :
816 pg 2827 CBC 4769 : for (offnum = minoff;
2828 1219140 : offnum <= maxoff;
2829 1214371 : offnum = OffsetNumberNext(offnum))
816 pg 2830 ECB : {
816 pg 2831 GIC 1214371 : ItemId itemid = PageGetItemId(page, offnum);
816 pg 2832 CBC 1214371 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2833 1214371 : TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2834 1214371 : TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2835 : BlockNumber tidblock;
816 pg 2836 ECB : void *match;
2837 :
816 pg 2838 CBC 1214371 : if (!BTreeTupleIsPosting(itup))
816 pg 2839 ECB : {
816 pg 2840 GIC 1179738 : tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2841 1179738 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2842 : sizeof(BlockNumber), _bt_blk_cmp);
816 pg 2843 ECB :
816 pg 2844 GIC 1179738 : if (!match)
816 pg 2845 ECB : {
816 pg 2846 CBC 790451 : Assert(!ItemIdIsDead(itemid));
816 pg 2847 GIC 790451 : continue;
2848 : }
816 pg 2849 ECB :
2850 : /*
2851 : * TID's table block is among those pointed to by the TIDs from
2852 : * LP_DEAD-bit set tuples on page -- add TID to deltids
2853 : */
816 pg 2854 GIC 389287 : odeltid->tid = itup->t_tid;
2855 389287 : odeltid->id = delstate.ndeltids;
2856 389287 : ostatus->idxoffnum = offnum;
2857 389287 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2858 389287 : ostatus->promising = false; /* unused */
816 pg 2859 CBC 389287 : ostatus->freespace = 0; /* unused */
816 pg 2860 ECB :
816 pg 2861 CBC 389287 : delstate.ndeltids++;
816 pg 2862 ECB : }
2863 : else
2864 : {
816 pg 2865 GIC 34633 : int nitem = BTreeTupleGetNPosting(itup);
816 pg 2866 ECB :
816 pg 2867 GIC 181092 : for (int p = 0; p < nitem; p++)
2868 : {
2869 146459 : ItemPointer tid = BTreeTupleGetPostingN(itup, p);
816 pg 2870 ECB :
816 pg 2871 GIC 146459 : tidblock = ItemPointerGetBlockNumber(tid);
816 pg 2872 CBC 146459 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2873 : sizeof(BlockNumber), _bt_blk_cmp);
816 pg 2874 ECB :
816 pg 2875 GIC 146459 : if (!match)
816 pg 2876 ECB : {
816 pg 2877 CBC 127648 : Assert(!ItemIdIsDead(itemid));
816 pg 2878 GIC 127648 : continue;
2879 : }
816 pg 2880 ECB :
2881 : /*
2882 : * TID's table block is among those pointed to by the TIDs
2883 : * from LP_DEAD-bit set tuples on page -- add TID to deltids
2884 : */
816 pg 2885 GIC 18811 : odeltid->tid = *tid;
2886 18811 : odeltid->id = delstate.ndeltids;
2887 18811 : ostatus->idxoffnum = offnum;
2888 18811 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2889 18811 : ostatus->promising = false; /* unused */
816 pg 2890 CBC 18811 : ostatus->freespace = 0; /* unused */
816 pg 2891 ECB :
816 pg 2892 CBC 18811 : odeltid++;
2893 18811 : ostatus++;
2894 18811 : delstate.ndeltids++;
816 pg 2895 ECB : }
2896 : }
2897 : }
2898 :
816 pg 2899 CBC 4769 : pfree(deadblocks);
2900 :
816 pg 2901 GIC 4769 : Assert(delstate.ndeltids >= ndeletable);
2902 :
2903 : /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
816 pg 2904 CBC 4769 : _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2905 :
2906 4769 : pfree(delstate.deltids);
816 pg 2907 GIC 4769 : pfree(delstate.status);
2908 4769 : }
816 pg 2909 ECB :
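/*
 * Editor's sketch (not part of nbtinsert.c): the per-TID membership test
 * used repeatedly above, in isolation.  'deadblocks' is assumed to be
 * the sorted, duplicate-free array produced by _bt_deadblocks(); the
 * function name is invented for illustration.
 */
static bool
sketch_tid_block_is_dead(ItemPointer tid,
						 BlockNumber *deadblocks, int ndeadblocks)
{
	BlockNumber tidblock = ItemPointerGetBlockNumber(tid);

	return bsearch(&tidblock, deadblocks, ndeadblocks,
				   sizeof(BlockNumber), _bt_blk_cmp) != NULL;
}
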
2910 : /*
2911 : * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2912 : *
2913 : * Builds a sorted, unique-ified array of table block numbers from index
2914 : * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2915 : * block from incoming newitem just in case it isn't among the LP_DEAD-related
2916 : * table blocks.
2917 : *
2918 : * Always counting the newitem's table block as an LP_DEAD related block makes
2919 : * sense because the cost is consistently low; it is practically certain that
2920 : * the table block will not incur a buffer miss in tableam. On the other hand
2921 : * the benefit is often quite high. There is a decent chance that there will
2922 : * be some deletable items from this block, since in general most garbage
2923 : * tuples became garbage in the recent past (in many cases this won't be the
2924 : * first logical row that core code added to/modified in table block
2925 : * recently).
2926 : *
2927 : * Returns final array, and sets *nblocks to its final size for caller.
2928 : */
2929 : static BlockNumber *
816 pg 2930 GIC 4769 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2931 : IndexTuple newitem, int *nblocks)
2932 : {
2933 : int spacentids,
2934 : ntids;
816 pg 2935 ECB : BlockNumber *tidblocks;
2936 :
2937 : /*
2938 : * Accumulate each TID's block in an array whose initial size has space for
2939 : * one table block per LP_DEAD-set tuple (plus space for the newitem table
2940 : * block). The array will only need to grow when there are LP_DEAD-marked
2941 : * posting list tuples (which is not that common).
2942 : */
816 pg 2943 GIC 4769 : spacentids = ndeletable + 1;
2944 4769 : ntids = 0;
2945 4769 : tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2946 :
2947 : /*
816 pg 2948 ECB : * First add the table block for the incoming newitem. This is the one
2949 : * case where simple deletion can visit a table block that doesn't have
2950 : * any known deletable items.
2951 : */
816 pg 2952 GIC 4769 : Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2953 4769 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2954 :
2955 125494 : for (int i = 0; i < ndeletable; i++)
2956 : {
816 pg 2957 CBC 120725 : ItemId itemid = PageGetItemId(page, deletable[i]);
2958 120725 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2959 :
2960 120725 : Assert(ItemIdIsDead(itemid));
2961 :
2962 120725 : if (!BTreeTupleIsPosting(itup))
816 pg 2963 ECB : {
816 pg 2964 GIC 117259 : if (ntids + 1 > spacentids)
816 pg 2965 ECB : {
816 pg 2966 GIC 88 : spacentids *= 2;
816 pg 2967 ECB : tidblocks = (BlockNumber *)
816 pg 2968 GIC 88 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
816 pg 2969 ECB : }
2970 :
816 pg 2971 CBC 117259 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2972 : }
816 pg 2973 ECB : else
2974 : {
816 pg 2975 GIC 3466 : int nposting = BTreeTupleGetNPosting(itup);
816 pg 2976 ECB :
816 pg 2977 GIC 3466 : if (ntids + nposting > spacentids)
2978 : {
2979 82 : spacentids = Max(spacentids * 2, ntids + nposting);
816 pg 2980 ECB : tidblocks = (BlockNumber *)
816 pg 2981 GIC 82 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
816 pg 2982 ECB : }
2983 :
816 pg 2984 CBC 11792 : for (int j = 0; j < nposting; j++)
2985 : {
2986 8326 : ItemPointer tid = BTreeTupleGetPostingN(itup, j);
2987 :
816 pg 2988 GIC 8326 : tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
816 pg 2989 ECB : }
2990 : }
2991 : }
2992 :
816 pg 2993 CBC 4769 : qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
816 pg 2994 GIC 4769 : *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
2995 :
2996 4769 : return tidblocks;
2997 : }
816 pg 2998 ECB :
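/*
 * Editor's sketch (not part of nbtinsert.c): the sort-then-unique step
 * at the end of _bt_deadblocks(), shown on concrete data.  qunique()
 * comes from lib/qunique.h, which this file already includes; lengthof()
 * is from c.h.
 */
BlockNumber blocks[] = {42, 7, 42, 13, 7};
int			nblocks = lengthof(blocks);

qsort(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);
/* blocks is now {7, 7, 13, 42, 42} */
nblocks = qunique(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);
/* the first nblocks (== 3) entries are now {7, 13, 42} */
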
2999 : /*
3000 : * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3001 : */
3002 : static inline int
816 pg 3003 GIC 2888450 : _bt_blk_cmp(const void *arg1, const void *arg2)
3004 : {
3005 2888450 : BlockNumber b1 = *((BlockNumber *) arg1);
3006 2888450 : BlockNumber b2 = *((BlockNumber *) arg2);
3007 :
816 pg 3008 CBC 2888450 : if (b1 < b2)
816 pg 3009 GIC 1570073 : return -1;
816 pg 3010 CBC 1318377 : else if (b1 > b2)
3011 660386 : return 1;
3012 :
3013 657991 : return 0;
816 pg 3014 ECB : }
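
/*
 * Editor's note (not part of nbtinsert.c): the explicit three-way
 * comparison above is deliberate.  A subtraction-based comparator such
 * as
 *
 *		return (int) (b1 - b2);		<-- WRONG for BlockNumber
 *
 * would misbehave because BlockNumber is an unsigned 32-bit type: e.g.
 * b1 = 0 and b2 = 0x80000001 give an unsigned difference of 0x7FFFFFFF,
 * which casts to a positive int even though b1 < b2.
 */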