/*-------------------------------------------------------------------------
 *
 * nbtinsert.c
 *      Item insertion in Lehman and Yao btrees for Postgres.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *      src/backend/access/nbtree/nbtinsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/transam.h"
#include "access/xloginsert.h"
#include "common/int.h"
#include "common/pg_prng.h"
#include "lib/qunique.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"

/* Minimum tree height for application of fastpath optimization */
#define BTREE_FASTPATH_MIN_LEVEL    2


static BTStack _bt_search_insert(Relation rel, Relation heaprel,
                                 BTInsertState insertstate);
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
                                      Relation heapRel,
                                      IndexUniqueCheck checkUnique, bool *is_unique,
                                      uint32 *speculativeToken);
static OffsetNumber _bt_findinsertloc(Relation rel,
                                      BTInsertState insertstate,
                                      bool checkingunique,
                                      bool indexUnchanged,
                                      BTStack stack,
                                      Relation heapRel);
static void _bt_stepright(Relation rel, Relation heaprel,
                          BTInsertState insertstate, BTStack stack);
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
                           Buffer buf,
                           Buffer cbuf,
                           BTStack stack,
                           IndexTuple itup,
                           Size itemsz,
                           OffsetNumber newitemoff,
                           int postingoff,
                           bool split_only_page);
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
                        Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
                        Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
                        IndexTuple nposting, uint16 postingoff);
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
                              Buffer rbuf, BTStack stack, bool isroot, bool isonly);
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
                                OffsetNumber itup_off, bool newfirstdataitem);
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
                                         BTInsertState insertstate,
                                         bool simpleonly, bool checkingunique,
                                         bool uniquedup, bool indexUnchanged);
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
                               OffsetNumber *deletable, int ndeletable,
                               IndexTuple newitem, OffsetNumber minoff,
                               OffsetNumber maxoff);
static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
                                   int ndeletable, IndexTuple newitem,
                                   int *nblocks);
static inline int _bt_blk_cmp(const void *arg1, const void *arg2);

/*
 *  _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
 *
 *      This routine is called by the public interface routine, btinsert.
 *      By here, itup is filled in, including the TID.
 *
 *      If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
 *      will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
 *      UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
 *      For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
 *      don't actually insert.
 *
 *      indexUnchanged executor hint indicates if itup is from an
 *      UPDATE that didn't logically change the indexed value, but
 *      must nevertheless have a new entry to point to a successor
 *      version.
 *
 *      The result value is only significant for UNIQUE_CHECK_PARTIAL:
 *      it must be true if the entry is known unique, else false.
 *      (In the current implementation we'll also return true after a
 *      successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
 *      that's just a coding artifact.)
 */
bool
_bt_doinsert(Relation rel, IndexTuple itup,
             IndexUniqueCheck checkUnique, bool indexUnchanged,
             Relation heapRel)
{
    bool        is_unique = false;
    BTInsertStateData insertstate;
    BTScanInsert itup_key;
    BTStack     stack;
    bool        checkingunique = (checkUnique != UNIQUE_CHECK_NO);

    /* we need an insertion scan key to do our search, so build one */
    itup_key = _bt_mkscankey(rel, itup);

    if (checkingunique)
    {
        if (!itup_key->anynullkeys)
        {
            /* No (heapkeyspace) scantid until uniqueness established */
            itup_key->scantid = NULL;
        }
        else
        {
            /*
             * Scan key for new tuple contains NULL key values.  Bypass
             * checkingunique steps.  They are unnecessary because core code
             * considers NULL unequal to every value, including NULL.
             *
             * This optimization avoids O(N^2) behavior within the
             * _bt_findinsertloc() heapkeyspace path when a unique index has a
             * large number of "duplicates" with NULL key values.
             */
            checkingunique = false;
            /* Tuple is unique in the sense that core code cares about */
            Assert(checkUnique != UNIQUE_CHECK_EXISTING);
            is_unique = true;
        }
    }

    /*
     * Fill in the BTInsertState working area, to track the current page and
     * position within the page to insert on.
     *
     * Note that itemsz is passed down to lower level code that deals with
     * inserting the item.  It must be MAXALIGN()'d.  This ensures that space
     * accounting code consistently considers the alignment overhead that we
     * expect PageAddItem() will add later.  (Actually, index_form_tuple() is
     * already conservative about alignment, but we don't rely on that from
     * this distance.  Besides, preserving the "true" tuple size in index
     * tuple headers for the benefit of nbtsplitloc.c might happen someday.
     * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
     */
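    /* (e.g., with 8-byte MAXALIGN, a 20-byte index tuple is counted as 24 bytes here) */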
    insertstate.itup = itup;
    insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
    insertstate.itup_key = itup_key;
    insertstate.bounds_valid = false;
    insertstate.buf = InvalidBuffer;
    insertstate.postingoff = 0;

search:

    /*
     * Find and lock the leaf page that the tuple should be added to by
     * searching from the root page.  insertstate.buf will hold a buffer that
     * is locked in exclusive mode afterwards.
     */
    stack = _bt_search_insert(rel, heapRel, &insertstate);

    /*
     * checkingunique inserts are not allowed to go ahead when two tuples with
     * equal key attribute values would be visible to new MVCC snapshots once
     * the xact commits.  Check for conflicts in the locked page/buffer (if
     * needed) here.
     *
     * It might be necessary to check a page to the right in _bt_check_unique,
     * though that should be very rare.  In practice the first page the value
     * could be on (with scantid omitted) is almost always also the only page
     * that a matching tuple might be found on.  This is due to the behavior
     * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
     * only be allowed to cross a page boundary when there is no candidate
     * leaf page split point that avoids it.  Also, _bt_check_unique can use
     * the leaf page high key to determine that there will be no duplicates on
     * the right sibling without actually visiting it (it uses the high key in
     * cases where the new item happens to belong at the far right of the leaf
     * page).
     *
     * NOTE: obviously, _bt_check_unique can only detect keys that are already
     * in the index; so it cannot defend against concurrent insertions of the
     * same key.  We protect against that by means of holding a write lock on
     * the first page the value could be on, with omitted/-inf value for the
     * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
     * the same key must acquire a write lock on the same page, so only one
     * would-be inserter can be making the check at one time.  Furthermore,
     * once we are past the check we hold write locks continuously until we
     * have performed our insertion, so no later inserter can fail to see our
     * insertion.  (This requires some care in _bt_findinsertloc.)
     *
     * If we must wait for another xact, we release the lock while waiting,
     * and then must perform a new search.
     *
     * For a partial uniqueness check, we don't wait for the other xact. Just
     * let the tuple in and return false for possibly non-unique, or true for
     * definitely unique.
     */
    if (checkingunique)
    {
        TransactionId xwait;
        uint32      speculativeToken;

        xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
                                 &is_unique, &speculativeToken);

        if (unlikely(TransactionIdIsValid(xwait)))
        {
            /* Have to wait for the other guy ... */
            _bt_relbuf(rel, insertstate.buf);
            insertstate.buf = InvalidBuffer;

            /*
             * If it's a speculative insertion, wait for it to finish (ie. to
             * go ahead with the insertion, or kill the tuple).  Otherwise
             * wait for the transaction to finish as usual.
             */
            if (speculativeToken)
                SpeculativeInsertionWait(xwait, speculativeToken);
            else
                XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);

            /* start over... */
            if (stack)
                _bt_freestack(stack);
            goto search;
        }

        /* Uniqueness is established -- restore heap tid as scantid */
        if (itup_key->heapkeyspace)
            itup_key->scantid = &itup->t_tid;
    }

    if (checkUnique != UNIQUE_CHECK_EXISTING)
    {
        OffsetNumber newitemoff;

        /*
         * The only conflict predicate locking cares about for indexes is when
         * an index tuple insert conflicts with an existing lock.  We don't
         * know the actual page we're going to insert on for sure just yet in
         * checkingunique and !heapkeyspace cases, but it's okay to use the
         * first page the value could be on (with scantid omitted) instead.
         */
        CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));

        /*
         * Do the insertion.  Note that insertstate contains cached binary
         * search bounds established within _bt_check_unique when insertion is
         * checkingunique.
         */
        newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
                                       indexUnchanged, stack, heapRel);
        _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
                       stack, itup, insertstate.itemsz, newitemoff,
                       insertstate.postingoff, false);
    }
    else
    {
        /* just release the buffer */
        _bt_relbuf(rel, insertstate.buf);
    }

    /* be tidy */
    if (stack)
        _bt_freestack(stack);
    pfree(itup_key);

    return is_unique;
}

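/*
 * For context, a minimal sketch of the public entry point that hands off to
 * _bt_doinsert().  The real btinsert() lives in nbtree.c; the exact signature
 * and the unused IndexInfo parameter shown here are assumptions rather than
 * something taken from this file:
 *
 *  bool
 *  btinsert(Relation rel, Datum *values, bool *isnull,
 *           ItemPointer ht_ctid, Relation heapRel,
 *           IndexUniqueCheck checkUnique, bool indexUnchanged,
 *           IndexInfo *indexInfo)
 *  {
 *      IndexTuple  itup;
 *      bool        result;
 *
 *      (form an index tuple from the datum array, tag it with the heap TID)
 *      itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
 *      itup->t_tid = *ht_ctid;
 *
 *      result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
 *
 *      pfree(itup);
 *      return result;
 *  }
 */
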
/*
 *  _bt_search_insert() -- _bt_search() wrapper for inserts
 *
 * Search the tree for a particular scankey, or more precisely for the first
 * leaf page it could be on.  Try to make use of the fastpath optimization's
 * rightmost leaf page cache before actually searching the tree from the root
 * page, though.
 *
 * Return value is a stack of parent-page pointers (though see notes about
 * fastpath optimization and page splits below).  insertstate->buf is set to
 * the address of the leaf-page buffer, which is write-locked and pinned in
 * all cases (if necessary by creating a new empty root page for caller).
 *
 * The fastpath optimization avoids most of the work of searching the tree
 * repeatedly when a single backend inserts successive new tuples on the
 * rightmost leaf page of an index.  A backend cache of the rightmost leaf
 * page is maintained within _bt_insertonpg(), and used here.  The cache is
 * invalidated here when an insert of a non-pivot tuple must take place on a
 * non-rightmost leaf page.
 *
 * The optimization helps with indexes on an auto-incremented field.  It also
 * helps with indexes on datetime columns, as well as indexes with lots of
 * NULL values.  (NULLs usually get inserted in the rightmost page for single
 * column indexes, since they usually get treated as coming after everything
 * else in the key space.  Individual NULL tuples will generally be placed on
 * the rightmost leaf page due to the influence of the heap TID column.)
 *
 * Note that we avoid applying the optimization when there is insufficient
 * space on the rightmost page to fit caller's new item.  This is necessary
 * because we'll need to return a real descent stack when a page split is
 * expected (actually, caller can cope with a leaf page split that uses a NULL
 * stack, but that's very slow and so must be avoided).  Note also that the
 * fastpath optimization acquires the lock on the page conditionally as a way
 * of reducing extra contention when there are concurrent insertions into the
 * rightmost page (we give up if we'd have to wait for the lock).  We assume
 * that it isn't useful to apply the optimization when there is contention,
 * since each per-backend cache won't stay valid for long.
 */
static BTStack
_bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
{
    Assert(insertstate->buf == InvalidBuffer);
    Assert(!insertstate->bounds_valid);
    Assert(insertstate->postingoff == 0);

    if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
    {
        /* Simulate a _bt_getbuf() call with conditional locking */
        insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
        if (_bt_conditionallockbuf(rel, insertstate->buf))
        {
            Page        page;
            BTPageOpaque opaque;

            _bt_checkpage(rel, insertstate->buf);
            page = BufferGetPage(insertstate->buf);
            opaque = BTPageGetOpaque(page);

            /*
             * Check if the page is still the rightmost leaf page and has
             * enough free space to accommodate the new tuple.  Also check
             * that the insertion scan key is strictly greater than the first
             * non-pivot tuple on the page.  (Note that we expect itup_key's
             * scantid to be unset when our caller is a checkingunique
             * inserter.)
             */
            if (P_RIGHTMOST(opaque) &&
                P_ISLEAF(opaque) &&
                !P_IGNORE(opaque) &&
                PageGetFreeSpace(page) > insertstate->itemsz &&
                PageGetMaxOffsetNumber(page) >= P_HIKEY &&
                _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
            {
                /*
                 * Caller can use the fastpath optimization because cached
                 * block is still rightmost leaf page, which can fit caller's
                 * new tuple without splitting.  Keep block in local cache for
                 * next insert, and have caller use NULL stack.
                 *
                 * Note that _bt_insert_parent() has an assertion that catches
                 * leaf page splits that somehow follow from a fastpath insert
                 * (it should only be passed a NULL stack when it must deal
                 * with a concurrent root page split, and never because a NULL
                 * stack was returned here).
                 */
                return NULL;
            }

            /* Page unsuitable for caller, drop lock and pin */
            _bt_relbuf(rel, insertstate->buf);
        }
        else
        {
            /* Lock unavailable, drop pin */
            ReleaseBuffer(insertstate->buf);
        }

        /* Forget block, since cache doesn't appear to be useful */
        RelationSetTargetBlock(rel, InvalidBlockNumber);
    }

    /* Cannot use optimization -- descend tree, return proper descent stack */
    return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
                      BT_WRITE);
}

/*
 *  _bt_check_unique() -- Check for violation of unique index constraint
 *
 * Returns InvalidTransactionId if there is no conflict, else an xact ID
 * we must wait for to see if it commits a conflicting tuple.   If an actual
 * conflict is detected, no return --- just ereport().  If an xact ID is
 * returned, and the conflicting tuple still has a speculative insertion in
 * progress, *speculativeToken is set to non-zero, and the caller can wait for
 * the verdict on the insertion using SpeculativeInsertionWait().
 *
 * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
 * InvalidTransactionId because we don't want to wait.  In this case we
 * set *is_unique to false if there is a potential conflict, and the
 * core code must redo the uniqueness check later.
 *
 * As a side-effect, sets state in insertstate that can later be used by
 * _bt_findinsertloc() to reuse most of the binary search work we do
 * here.
 *
 * This code treats NULLs as equal, unlike the default semantics for unique
 * indexes.  So do not call here when there are NULL values in scan key and
 * the index uses the default NULLS DISTINCT mode.
 */
static TransactionId
_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
                 IndexUniqueCheck checkUnique, bool *is_unique,
                 uint32 *speculativeToken)
{
    IndexTuple  itup = insertstate->itup;
    IndexTuple  curitup = NULL;
    ItemId      curitemid = NULL;
    BTScanInsert itup_key = insertstate->itup_key;
    SnapshotData SnapshotDirty;
    OffsetNumber offset;
    OffsetNumber maxoff;
    Page        page;
    BTPageOpaque opaque;
    Buffer      nbuf = InvalidBuffer;
    bool        found = false;
    bool        inposting = false;
    bool        prevalldead = true;
    int         curposti = 0;

    /* Assume unique until we find a duplicate */
    *is_unique = true;

    InitDirtySnapshot(SnapshotDirty);

    page = BufferGetPage(insertstate->buf);
    opaque = BTPageGetOpaque(page);
    maxoff = PageGetMaxOffsetNumber(page);

    /*
     * Find the first tuple with the same key.
     *
     * This also saves the binary search bounds in insertstate.  We use them
     * in the fastpath below, but also in the _bt_findinsertloc() call later.
     */
    Assert(!insertstate->bounds_valid);
    offset = _bt_binsrch_insert(rel, insertstate);

    /*
     * Scan over all equal tuples, looking for live conflicts.
     */
    Assert(!insertstate->bounds_valid || insertstate->low == offset);
    Assert(!itup_key->anynullkeys);
    Assert(itup_key->scantid == NULL);
    for (;;)
    {
        /*
         * Each iteration of the loop processes one heap TID, not one index
         * tuple.  Current offset number for page isn't usually advanced on
         * iterations that process heap TIDs from posting list tuples.
         *
         * "inposting" state is set when _inside_ a posting list --- not when
         * we're at the start (or end) of a posting list.  We advance curposti
         * at the end of the iteration when inside a posting list tuple.  In
         * general, every loop iteration either advances the page offset or
         * advances curposti --- an iteration that handles the rightmost/max
         * heap TID in a posting list finally advances the page offset (and
         * unsets "inposting").
         *
         * Make sure the offset points to an actual index tuple before trying
         * to examine it...
         */
        if (offset <= maxoff)
        {
            /*
             * Fastpath: In most cases, we can use cached search bounds to
             * limit our consideration to items that are definitely
             * duplicates.  This fastpath doesn't apply when the original page
             * is empty, or when initial offset is past the end of the
             * original page, which may indicate that we need to examine a
             * second or subsequent page.
             *
             * Note that this optimization allows us to avoid calling
             * _bt_compare() directly when there are no duplicates, as long as
             * the offset where the key will go is not at the end of the page.
             */
            if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
            {
                Assert(insertstate->bounds_valid);
                Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
                Assert(insertstate->low <= insertstate->stricthigh);
                Assert(_bt_compare(rel, itup_key, page, offset) < 0);
                break;
            }

            /*
             * We can skip items that are already marked killed.
             *
             * In the presence of heavy update activity an index may contain
             * many killed items with the same key; running _bt_compare() on
             * each killed item gets expensive.  Just advance over killed
             * items as quickly as we can.  We only apply _bt_compare() when
             * we get to a non-killed item.  We could reuse the bounds to
             * avoid _bt_compare() calls for known equal tuples, but it
             * doesn't seem worth it.
             */
            if (!inposting)
                curitemid = PageGetItemId(page, offset);
            if (inposting || !ItemIdIsDead(curitemid))
            {
                ItemPointerData htid;
                bool        all_dead = false;

                if (!inposting)
                {
                    /* Plain tuple, or first TID in posting list tuple */
                    if (_bt_compare(rel, itup_key, page, offset) != 0)
                        break;  /* we're past all the equal tuples */

                    /* Advanced curitup */
                    curitup = (IndexTuple) PageGetItem(page, curitemid);
                    Assert(!BTreeTupleIsPivot(curitup));
                }

                /* okay, we gotta fetch the heap tuple using htid ... */
                if (!BTreeTupleIsPosting(curitup))
                {
                    /* ... htid is from simple non-pivot tuple */
                    Assert(!inposting);
                    htid = curitup->t_tid;
                }
                else if (!inposting)
                {
                    /* ... htid is first TID in new posting list */
                    inposting = true;
                    prevalldead = true;
                    curposti = 0;
                    htid = *BTreeTupleGetPostingN(curitup, 0);
                }
                else
                {
                    /* ... htid is second or subsequent TID in posting list */
                    Assert(curposti > 0);
                    htid = *BTreeTupleGetPostingN(curitup, curposti);
                }

                /*
                 * If we are doing a recheck, we expect to find the tuple we
                 * are rechecking.  It's not a duplicate, but we have to keep
                 * scanning.
                 */
                if (checkUnique == UNIQUE_CHECK_EXISTING &&
                    ItemPointerCompare(&htid, &itup->t_tid) == 0)
                {
                    found = true;
                }

                /*
                 * Check if there's any table tuples for this index entry
                 * satisfying SnapshotDirty. This is necessary because for AMs
                 * with optimizations like heap's HOT, we have just a single
                 * index entry for the entire chain.
                 */
                else if (table_index_fetch_tuple_check(heapRel, &htid,
                                                       &SnapshotDirty,
                                                       &all_dead))
                {
                    TransactionId xwait;

                    /*
                     * It is a duplicate. If we are only doing a partial
                     * check, then don't bother checking if the tuple is being
                     * updated in another transaction. Just return the fact
                     * that it is a potential conflict and leave the full
                     * check till later. Don't invalidate binary search
                     * bounds.
                     */
                    if (checkUnique == UNIQUE_CHECK_PARTIAL)
                    {
                        if (nbuf != InvalidBuffer)
                            _bt_relbuf(rel, nbuf);
                        *is_unique = false;
                        return InvalidTransactionId;
                    }

                    /*
                     * If this tuple is being updated by other transaction
                     * then we have to wait for its commit/abort.
                     */
                    xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
                        SnapshotDirty.xmin : SnapshotDirty.xmax;

                    if (TransactionIdIsValid(xwait))
                    {
                        if (nbuf != InvalidBuffer)
                            _bt_relbuf(rel, nbuf);
                        /* Tell _bt_doinsert to wait... */
                        *speculativeToken = SnapshotDirty.speculativeToken;
                        /* Caller releases lock on buf immediately */
                        insertstate->bounds_valid = false;
                        return xwait;
                    }

                    /*
                     * Otherwise we have a definite conflict.  But before
                     * complaining, look to see if the tuple we want to insert
                     * is itself now committed dead --- if so, don't complain.
                     * This is a waste of time in normal scenarios but we must
                     * do it to support CREATE INDEX CONCURRENTLY.
                     *
                     * We must follow HOT-chains here because during
                     * concurrent index build, we insert the root TID though
                     * the actual tuple may be somewhere in the HOT-chain.
                     * While following the chain we might not stop at the
                     * exact tuple which triggered the insert, but that's OK
                     * because if we find a live tuple anywhere in this chain,
                     * we have a unique key conflict.  The other live tuple is
                     * not part of this chain because it had a different index
                     * entry.
                     */
                    htid = itup->t_tid;
                    if (table_index_fetch_tuple_check(heapRel, &htid,
                                                      SnapshotSelf, NULL))
                    {
                        /* Normal case --- it's still live */
                    }
                    else
                    {
                        /*
                         * It's been deleted, so no error, and no need to
                         * continue searching
                         */
                        break;
                    }

                    /*
                     * Check for a conflict-in as we would if we were going to
                     * write to this page.  We aren't actually going to write,
                     * but we want a chance to report SSI conflicts that would
                     * otherwise be masked by this unique constraint
                     * violation.
                     */
                    CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));

                    /*
                     * This is a definite conflict.  Break the tuple down into
                     * datums and report the error.  But first, make sure we
                     * release the buffer locks we're holding ---
                     * BuildIndexValueDescription could make catalog accesses,
                     * which in the worst case might touch this same index and
                     * cause deadlocks.
                     */
                    if (nbuf != InvalidBuffer)
                        _bt_relbuf(rel, nbuf);
                    _bt_relbuf(rel, insertstate->buf);
                    insertstate->buf = InvalidBuffer;
                    insertstate->bounds_valid = false;

                    {
                        Datum       values[INDEX_MAX_KEYS];
                        bool        isnull[INDEX_MAX_KEYS];
                        char       *key_desc;

                        index_deform_tuple(itup, RelationGetDescr(rel),
                                           values, isnull);

                        key_desc = BuildIndexValueDescription(rel, values,
                                                              isnull);

                        ereport(ERROR,
                                (errcode(ERRCODE_UNIQUE_VIOLATION),
                                 errmsg("duplicate key value violates unique constraint \"%s\"",
                                        RelationGetRelationName(rel)),
                                 key_desc ? errdetail("Key %s already exists.",
                                                      key_desc) : 0,
                                 errtableconstraint(heapRel,
                                                    RelationGetRelationName(rel))));
                    }
                }
                else if (all_dead && (!inposting ||
                                      (prevalldead &&
                                       curposti == BTreeTupleGetNPosting(curitup) - 1)))
                {
                    /*
                     * The conflicting tuple (or all HOT chains pointed to by
                     * all posting list TIDs) is dead to everyone, so mark the
                     * index entry killed.
                     */
                    ItemIdMarkDead(curitemid);
                    opaque->btpo_flags |= BTP_HAS_GARBAGE;

                    /*
                     * Mark buffer with a dirty hint, since state is not
                     * crucial. Be sure to mark the proper buffer dirty.
                     */
                    if (nbuf != InvalidBuffer)
                        MarkBufferDirtyHint(nbuf, true);
                    else
                        MarkBufferDirtyHint(insertstate->buf, true);
                }

                /*
                 * Remember if posting list tuple has even a single HOT chain
                 * whose members are not all dead
                 */
                if (!all_dead && inposting)
                    prevalldead = false;
            }
        }

        if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
        {
            /* Advance to next TID in same posting list */
            curposti++;
            continue;
        }
        else if (offset < maxoff)
        {
            /* Advance to next tuple */
            curposti = 0;
            inposting = false;
            offset = OffsetNumberNext(offset);
        }
        else
        {
            int         highkeycmp;

            /* If scankey == hikey we gotta check the next page too */
            if (P_RIGHTMOST(opaque))
                break;
            highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
            Assert(highkeycmp <= 0);
            if (highkeycmp != 0)
                break;
            /* Advance to next non-dead page --- there must be one */
            for (;;)
            {
                BlockNumber nblkno = opaque->btpo_next;

                nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
                page = BufferGetPage(nbuf);
                opaque = BTPageGetOpaque(page);
                if (!P_IGNORE(opaque))
                    break;
                if (P_RIGHTMOST(opaque))
                    elog(ERROR, "fell off the end of index \"%s\"",
                         RelationGetRelationName(rel));
            }
            /* Will also advance to next tuple */
            curposti = 0;
            inposting = false;
            maxoff = PageGetMaxOffsetNumber(page);
            offset = P_FIRSTDATAKEY(opaque);
            /* Don't invalidate binary search bounds */
        }
    }

    /*
     * If we are doing a recheck then we should have found the tuple we are
     * checking.  Otherwise there's something very wrong --- probably, the
     * index is on a non-immutable expression.
     */
    if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
        ereport(ERROR,
                (errcode(ERRCODE_INTERNAL_ERROR),
                 errmsg("failed to re-find tuple within index \"%s\"",
                        RelationGetRelationName(rel)),
                 errhint("This may be because of a non-immutable index expression."),
                 errtableconstraint(heapRel,
                                    RelationGetRelationName(rel))));

    if (nbuf != InvalidBuffer)
        _bt_relbuf(rel, nbuf);

    return InvalidTransactionId;
}

/*
 *  _bt_findinsertloc() -- Finds an insert location for a tuple
 *
 *      On entry, insertstate buffer contains the page the new tuple belongs
 *      on.  It is exclusive-locked and pinned by the caller.
 *
 *      If 'checkingunique' is true, the buffer on entry is the first page
 *      that contains duplicates of the new key.  If there are duplicates on
 *      multiple pages, the correct insertion position might be some page to
 *      the right, rather than the first page.  In that case, this function
 *      moves right to the correct target page.
 *
 *      (In a !heapkeyspace index, there can be multiple pages with the same
 *      high key, where the new tuple could legitimately be placed on.  In
 *      that case, the caller passes the first page containing duplicates,
 *      just like when checkingunique=true.  If that page doesn't have enough
 *      room for the new tuple, this function moves right, trying to find a
 *      legal page that does.)
 *
 *      If 'indexUnchanged' is true, this is for an UPDATE that didn't
 *      logically change the indexed value, but must nevertheless have a new
 *      entry to point to a successor version.  This hint from the executor
 *      will influence our behavior when the page might have to be split and
 *      we must consider our options.  Bottom-up index deletion can avoid
 *      pathological version-driven page splits, but we only want to go to the
 *      trouble of trying it when we already have moderate confidence that
 *      it's appropriate.  The hint should not significantly affect our
 *      behavior over time unless practically all inserts on to the leaf page
 *      get the hint.
 *
 *      On exit, insertstate buffer contains the chosen insertion page, and
 *      the offset within that page is returned.  If _bt_findinsertloc needed
 *      to move right, the lock and pin on the original page are released, and
 *      the new buffer is exclusively locked and pinned instead.
 *
 *      If insertstate contains cached binary search bounds, we will take
 *      advantage of them.  This avoids repeating comparisons that we made in
 *      _bt_check_unique() already.
 */
static OffsetNumber
_bt_findinsertloc(Relation rel,
                  BTInsertState insertstate,
                  bool checkingunique,
                  bool indexUnchanged,
                  BTStack stack,
                  Relation heapRel)
{
    BTScanInsert itup_key = insertstate->itup_key;
    Page        page = BufferGetPage(insertstate->buf);
    BTPageOpaque opaque;
    OffsetNumber newitemoff;

    opaque = BTPageGetOpaque(page);

    /* Check 1/3 of a page restriction */
    if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
        _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
                             insertstate->itup);

    Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
    Assert(!insertstate->bounds_valid || checkingunique);
    Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
    Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
    Assert(!itup_key->allequalimage || itup_key->heapkeyspace);

    if (itup_key->heapkeyspace)
    {
        /* Keep track of whether checkingunique duplicate seen */
        bool        uniquedup = indexUnchanged;

        /*
         * If we're inserting into a unique index, we may have to walk right
         * through leaf pages to find the one leaf page that we must insert on
         * to.
         *
         * This is needed for checkingunique callers because a scantid was not
         * used when we called _bt_search().  scantid can only be set after
         * _bt_check_unique() has checked for duplicates.  The buffer
         * initially stored in insertstate->buf has the page where the first
         * duplicate key might be found, which isn't always the page that new
         * tuple belongs on.  The heap TID attribute for new tuple (scantid)
         * could force us to insert on a sibling page, though that should be
         * very rare in practice.
         */
        if (checkingunique)
        {
            if (insertstate->low < insertstate->stricthigh)
            {
                /* Encountered a duplicate in _bt_check_unique() */
                Assert(insertstate->bounds_valid);
                uniquedup = true;
            }

            for (;;)
            {
                /*
                 * Does the new tuple belong on this page?
                 *
                 * The earlier _bt_check_unique() call may well have
                 * established a strict upper bound on the offset for the new
                 * item.  If it's not the last item of the page (i.e. if there
                 * is at least one tuple on the page that goes after the tuple
                 * we're inserting) then we know that the tuple belongs on
                 * this page.  We can skip the high key check.
                 */
                if (insertstate->bounds_valid &&
                    insertstate->low <= insertstate->stricthigh &&
                    insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
                    break;

                /* Test '<=', not '!=', since scantid is set now */
                if (P_RIGHTMOST(opaque) ||
                    _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
                    break;

                _bt_stepright(rel, heapRel, insertstate, stack);
                /* Update local state after stepping right */
                page = BufferGetPage(insertstate->buf);
                opaque = BTPageGetOpaque(page);
                /* Assume duplicates (if checkingunique) */
                uniquedup = true;
            }
        }

        /*
         * If the target page cannot fit newitem, try to avoid splitting the
         * page on insert by performing deletion or deduplication now
         */
        if (PageGetFreeSpace(page) < insertstate->itemsz)
            _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
                                         checkingunique, uniquedup,
                                         indexUnchanged);
    }
    else
    {
        /*----------
         * This is a !heapkeyspace (version 2 or 3) index.  The current page
         * is the first page that we could insert the new tuple to, but there
         * may be other pages to the right that we could opt to use instead.
         *
         * If the new key is equal to one or more existing keys, we can
         * legitimately place it anywhere in the series of equal keys.  In
         * fact, if the new key is equal to the page's "high key" we can place
         * it on the next page.  If it is equal to the high key, and there's
         * not room to insert the new tuple on the current page without
         * splitting, then we move right hoping to find more free space and
         * avoid a split.
         *
         * Keep scanning right until we
         *      (a) find a page with enough free space,
         *      (b) reach the last page where the tuple can legally go, or
         *      (c) get tired of searching.
         * (c) is not flippant; it is important because if there are many
         * pages' worth of equal keys, it's better to split one of the early
         * pages than to scan all the way to the end of the run of equal keys
         * on every insert.  We implement "get tired" as a random choice,
         * since stopping after scanning a fixed number of pages wouldn't work
         * well (we'd never reach the right-hand side of previously split
         * pages).  The probability of moving right is set at 0.99, which may
         * seem too high to change the behavior much, but it does an excellent
         * job of preventing O(N^2) behavior with many equal keys.
         *----------
         */
        while (PageGetFreeSpace(page) < insertstate->itemsz)
        {
            /*
             * Before considering moving right, see if we can obtain enough
             * space by erasing LP_DEAD items
             */
            if (P_HAS_GARBAGE(opaque))
            {
                /* Perform simple deletion */
                _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
                                             false, false, false);

                if (PageGetFreeSpace(page) >= insertstate->itemsz)
                    break;      /* OK, now we have enough space */
            }

            /*
             * Nope, so check conditions (b) and (c) enumerated above
             *
             * The earlier _bt_check_unique() call may well have established a
             * strict upper bound on the offset for the new item.  If it's not
             * the last item of the page (i.e. if there is at least one tuple
             * on the page that's greater than the tuple we're inserting to)
             * then we know that the tuple belongs on this page.  We can skip
             * the high key check.
             */
            if (insertstate->bounds_valid &&
                insertstate->low <= insertstate->stricthigh &&
                insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
                break;

            if (P_RIGHTMOST(opaque) ||
                _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
                pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
                break;

            _bt_stepright(rel, heapRel, insertstate, stack);
            /* Update local state after stepping right */
            page = BufferGetPage(insertstate->buf);
            opaque = BTPageGetOpaque(page);
        }
    }

    /*
     * We should now be on the correct page.  Find the offset within the page
     * for the new tuple. (Possibly reusing earlier search bounds.)
     */
    Assert(P_RIGHTMOST(opaque) ||
           _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);

    newitemoff = _bt_binsrch_insert(rel, insertstate);

    if (insertstate->postingoff == -1)
    {
        /*
         * There is an overlapping posting list tuple with its LP_DEAD bit
         * set.  We don't want to unnecessarily unset its LP_DEAD bit while
         * performing a posting list split, so perform simple index tuple
         * deletion early.
         */
        _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
                                     false, false, false);

        /*
         * Do new binary search.  New insert location cannot overlap with any
         * posting list now.
         */
        Assert(!insertstate->bounds_valid);
        insertstate->postingoff = 0;
        newitemoff = _bt_binsrch_insert(rel, insertstate);
        Assert(insertstate->postingoff == 0);
    }

    return newitemoff;
}

/*
 * Step right to next non-dead page, during insertion.
 *
 * This is a bit more complicated than moving right in a search.  We must
 * write-lock the target page before releasing write lock on current page;
 * else someone else's _bt_check_unique scan could fail to see our insertion.
 * Write locks on intermediate dead pages won't do because we don't know when
 * they will get de-linked from the tree.
 *
 * This is more aggressive than it needs to be for non-unique !heapkeyspace
 * indexes.
 */
static void
_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
              BTStack stack)
{
    Page        page;
    BTPageOpaque opaque;
    Buffer      rbuf;
    BlockNumber rblkno;

    Assert(heaprel != NULL);
    page = BufferGetPage(insertstate->buf);
    opaque = BTPageGetOpaque(page);

    rbuf = InvalidBuffer;
    rblkno = opaque->btpo_next;
    for (;;)
    {
        rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
        page = BufferGetPage(rbuf);
        opaque = BTPageGetOpaque(page);

        /*
         * If this page was incompletely split, finish the split now.  We do
         * this while holding a lock on the left sibling, which is not good
         * because finishing the split could be a fairly lengthy operation.
         * But this should happen very seldom.
         */
        if (P_INCOMPLETE_SPLIT(opaque))
        {
            _bt_finish_split(rel, heaprel, rbuf, stack);
            rbuf = InvalidBuffer;
            continue;
        }

        if (!P_IGNORE(opaque))
            break;
        if (P_RIGHTMOST(opaque))
            elog(ERROR, "fell off the end of index \"%s\"",
                 RelationGetRelationName(rel));

        rblkno = opaque->btpo_next;
    }
    /* rbuf locked; unlock buf, update state for caller */
    _bt_relbuf(rel, insertstate->buf);
    insertstate->buf = rbuf;
    insertstate->bounds_valid = false;
}

/*----------
 *  _bt_insertonpg() -- Insert a tuple on a particular page in the index.
 *
 *      This recursive procedure does the following things:
 *
 *          +  if postingoff != 0, splits existing posting list tuple
 *             (since it overlaps with new 'itup' tuple).
 *          +  if necessary, splits the target page, using 'itup_key' for
 *             suffix truncation on leaf pages (caller passes NULL for
 *             non-leaf pages).
 *          +  inserts the new tuple (might be split from posting list).
 *          +  if the page was split, pops the parent stack, and finds the
 *             right place to insert the new child pointer (by walking
 *             right using information stored in the parent stack).
 *          +  invokes itself with the appropriate tuple for the right
 *             child page on the parent.
 *          +  updates the metapage if a true root or fast root is split.
 *
 *      On entry, we must have the correct buffer in which to do the
 *      insertion, and the buffer must be pinned and write-locked.  On return,
 *      we will have dropped both the pin and the lock on the buffer.
 *
 *      This routine only performs retail tuple insertions.  'itup' should
 *      always be either a non-highkey leaf item, or a downlink (new high
 *      key items are created indirectly, when a page is split).  When
 *      inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
 *      we're inserting the downlink for.  This function will clear the
 *      INCOMPLETE_SPLIT flag on it, and release the buffer.
 *----------
 */
static void
_bt_insertonpg(Relation rel,
               Relation heaprel,
               BTScanInsert itup_key,
               Buffer buf,
               Buffer cbuf,
               BTStack stack,
               IndexTuple itup,
               Size itemsz,
               OffsetNumber newitemoff,
               int postingoff,
               bool split_only_page)
{
    Page        page;
    BTPageOpaque opaque;
    bool        isleaf,
                isroot,
                isrightmost,
                isonly;
    IndexTuple  oposting = NULL;
    IndexTuple  origitup = NULL;
    IndexTuple  nposting = NULL;

    page = BufferGetPage(buf);
    opaque = BTPageGetOpaque(page);
    isleaf = P_ISLEAF(opaque);
    isroot = P_ISROOT(opaque);
    isrightmost = P_RIGHTMOST(opaque);
    isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);

    /* child buffer must be given iff inserting on an internal page */
    Assert(isleaf == !BufferIsValid(cbuf));
    /* tuple must have appropriate number of attributes */
    Assert(!isleaf ||
           BTreeTupleGetNAtts(itup, rel) ==
           IndexRelationGetNumberOfAttributes(rel));
    Assert(isleaf ||
           BTreeTupleGetNAtts(itup, rel) <=
           IndexRelationGetNumberOfKeyAttributes(rel));
    Assert(!BTreeTupleIsPosting(itup));
    Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
    /* Caller must always finish incomplete split for us */
    Assert(!P_INCOMPLETE_SPLIT(opaque));

    /*
     * Every internal page should have exactly one negative infinity item at
     * all times.  Only _bt_split() and _bt_newlevel() should add items that
     * become negative infinity items through truncation, since they're the
     * only routines that allocate new internal pages.
     */
    Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));

    /*
     * Do we need to split an existing posting list item?
     */
    if (postingoff != 0)
    {
        ItemId      itemid = PageGetItemId(page, newitemoff);

        /*
         * The new tuple is a duplicate with a heap TID that falls inside the
         * range of an existing posting list tuple on a leaf page.  Prepare to
         * split an existing posting list.  Overwriting the posting list with
         * its post-split version is treated as an extra step in either the
         * insert or page split critical section.
         */
        Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
        oposting = (IndexTuple) PageGetItem(page, itemid);

        /*
         * postingoff value comes from earlier call to _bt_binsrch_posting().
         * Its binary search might think that a plain tuple must be a posting
         * list tuple that needs to be split.  This can happen with corruption
         * involving an existing plain tuple that is a duplicate of the new
         * item, up to and including its table TID.  Check for that here in
         * passing.
         *
         * Also verify that our caller has made sure that the existing posting
         * list tuple does not have its LP_DEAD bit set.
         */
        if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
                                     ItemPointerGetBlockNumber(&itup->t_tid),
                                     ItemPointerGetOffsetNumber(&itup->t_tid),
                                     newitemoff, BufferGetBlockNumber(buf),
                                     RelationGetRelationName(rel))));

        /* use a mutable copy of itup as our itup from here on */
        origitup = itup;
        itup = CopyIndexTuple(origitup);
        nposting = _bt_swap_posting(itup, oposting, postingoff);
        /* itup now contains rightmost/max TID from oposting */

        /* Alter offset so that newitem goes after posting list */
        newitemoff = OffsetNumberNext(newitemoff);
    }

    /*
     * Do we need to split the page to fit the item on it?
     *
     * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
     * so this comparison is correct even though we appear to be accounting
     * only for the item and not for its line pointer.
     */
    if (PageGetFreeSpace(page) < itemsz)
    {
        Buffer      rbuf;

        Assert(!split_only_page);

        /* split the buffer into left and right halves */
        rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
                         itup, origitup, nposting, postingoff);
        PredicateLockPageSplit(rel,
                               BufferGetBlockNumber(buf),
                               BufferGetBlockNumber(rbuf));

        /*----------
         * By here,
         *
         *      +  our target page has been split;
         *      +  the original tuple has been inserted;
         *      +  we have write locks on both the old (left half)
         *         and new (right half) buffers, after the split; and
         *      +  we know the key we want to insert into the parent
         *         (it's the "high key" on the left child page).
         *
         * We're ready to do the parent insertion.  We need to hold onto the
         * locks for the child pages until we locate the parent, but we can
         * at least release the lock on the right child before doing the
         * actual insertion.  The lock on the left child will be released
         * last of all by parent insertion, where it is the 'cbuf' of parent
         * page.
         *----------
         */
        _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
    }
    else
    {
        Buffer      metabuf = InvalidBuffer;
        Page        metapg = NULL;
        BTMetaPageData *metad = NULL;
        BlockNumber blockcache;

        /*
         * If we are doing this insert because we split a page that was the
         * only one on its tree level, but was not the root, it may have been
         * the "fast root".  We need to ensure that the fast root link points
         * at or above the current page.  We can safely acquire a lock on the
         * metapage here --- see comments for _bt_newlevel().
         */
        if (unlikely(split_only_page))
        {
            Assert(!isleaf);
            Assert(BufferIsValid(cbuf));

            metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
            metapg = BufferGetPage(metabuf);
            metad = BTPageGetMeta(metapg);

            if (metad->btm_fastlevel >= opaque->btpo_level)
            {
                /* no update wanted */
                _bt_relbuf(rel, metabuf);
                metabuf = InvalidBuffer;
            }
        }

        /* Do the update.  No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        if (postingoff != 0)
            memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));

        if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
                        false) == InvalidOffsetNumber)
            elog(PANIC, "failed to add new item to block %u in index \"%s\"",
                 BufferGetBlockNumber(buf), RelationGetRelationName(rel));

        MarkBufferDirty(buf);

        if (BufferIsValid(metabuf))
        {
            /* upgrade meta-page if needed */
            if (metad->btm_version < BTREE_NOVAC_VERSION)
                _bt_upgrademetapage(metapg);
            metad->btm_fastroot = BufferGetBlockNumber(buf);
            metad->btm_fastlevel = opaque->btpo_level;
            MarkBufferDirty(metabuf);
        }

        /*
         * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
         * finishes a split
         */
        if (!isleaf)
        {
            Page        cpage = BufferGetPage(cbuf);
            BTPageOpaque cpageop = BTPageGetOpaque(cpage);

            Assert(P_INCOMPLETE_SPLIT(cpageop));
            cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
            MarkBufferDirty(cbuf);
        }

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_insert xlrec;
            xl_btree_metadata xlmeta;
            uint8       xlinfo;
            XLogRecPtr  recptr;
            uint16      upostingoff;

            xlrec.offnum = newitemoff;

            XLogBeginInsert();
            XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);

            if (isleaf && postingoff == 0)
            {
                /* Simple leaf insert */
                xlinfo = XLOG_BTREE_INSERT_LEAF;
            }
            else if (postingoff != 0)
            {
                /*
                 * Leaf insert with posting list split.  Must include
                 * postingoff field before newitem/orignewitem.
                 */
                Assert(isleaf);
1509 1337 : 12033 : xlinfo = XLOG_BTREE_INSERT_POST;
1338 : : }
1339 : : else
1340 : : {
1341 : : /* Internal page insert, which finishes a split on cbuf */
6576 tgl@sss.pgh.pa.us 1342 : 9148 : xlinfo = XLOG_BTREE_INSERT_UPPER;
1462 pg@bowt.ie 1343 : 9148 : XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1344 : :
1461 1345 [ + + ]: 9148 : if (BufferIsValid(metabuf))
1346 : : {
1347 : : /* Actually, it's an internal page insert + meta update */
1348 : 12 : xlinfo = XLOG_BTREE_INSERT_META;
1349 : :
1350 [ - + ]: 12 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1351 : 12 : xlmeta.version = metad->btm_version;
1352 : 12 : xlmeta.root = metad->btm_root;
1353 : 12 : xlmeta.level = metad->btm_level;
1354 : 12 : xlmeta.fastroot = metad->btm_fastroot;
1355 : 12 : xlmeta.fastlevel = metad->btm_fastlevel;
1145 1356 : 12 : xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1461 1357 : 12 : xlmeta.allequalimage = metad->btm_allequalimage;
1358 : :
1359 : 12 : XLogRegisterBuffer(2, metabuf,
1360 : : REGBUF_WILL_INIT | REGBUF_STANDARD);
1361 : 12 : XLogRegisterBufData(2, (char *) &xlmeta,
1362 : : sizeof(xl_btree_metadata));
1363 : : }
1364 : : }
1365 : :
3433 heikki.linnakangas@i 1366 : 3038045 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1509 pg@bowt.ie 1367 [ + + ]: 3038045 : if (postingoff == 0)
1368 : : {
1369 : : /* Just log itup from caller */
1370 : 3026012 : XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
1371 : : }
1372 : : else
1373 : : {
1374 : : /*
1375 : : * Insert with posting list split (XLOG_BTREE_INSERT_POST
1376 : : * record) case.
1377 : : *
1378 : : * Log postingoff. Also log origitup, not itup. REDO routine
1379 : : * must reconstruct final itup (as well as nposting) using
1380 : : * _bt_swap_posting().
1381 : : */
1445 1382 : 12033 : upostingoff = postingoff;
1383 : :
1509 1384 : 12033 : XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
1385 : 12033 : XLogRegisterBufData(0, (char *) origitup,
1386 : 12033 : IndexTupleSize(origitup));
1387 : : }
1388 : :
3433 heikki.linnakangas@i 1389 : 3038045 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1390 : :
7723 tgl@sss.pgh.pa.us 1391 [ + + ]: 3038045 : if (BufferIsValid(metabuf))
1392 : 12 : PageSetLSN(metapg, recptr);
1462 pg@bowt.ie 1393 [ + + ]: 3038045 : if (!isleaf)
2916 kgrittn@postgresql.o 1394 : 9148 : PageSetLSN(BufferGetPage(cbuf), recptr);
1395 : :
7723 tgl@sss.pgh.pa.us 1396 : 3038045 : PageSetLSN(page, recptr);
1397 : : }
1398 : :
1399 [ - + ]: 3357998 : END_CRIT_SECTION();
1400 : :
1401 : : /* Release subsidiary buffers */
1402 [ + + ]: 3357998 : if (BufferIsValid(metabuf))
6589 1403 : 12 : _bt_relbuf(rel, metabuf);
1462 pg@bowt.ie 1404 [ + + ]: 3357998 : if (!isleaf)
3680 heikki.linnakangas@i 1405 : 9966 : _bt_relbuf(rel, cbuf);
1406 : :
1407 : : /*
1408 : : * Cache the block number if this is the rightmost leaf page. Cache
1409 : : * may be used by a future inserter within _bt_search_insert().
1410 : : */
1488 pg@bowt.ie 1411 : 3357998 : blockcache = InvalidBlockNumber;
1244 1412 [ + + + + : 3357998 : if (isrightmost && isleaf && !isroot)
+ + ]
1488 1413 : 1916143 : blockcache = BufferGetBlockNumber(buf);
1414 : :
1415 : : /* Release buffer for insertion target block */
6589 tgl@sss.pgh.pa.us 1416 : 3357998 : _bt_relbuf(rel, buf);
1417 : :
1418 : : /*
1419 : : * If we decided to cache the insertion target block before releasing
1420 : : * its buffer lock, then cache it now. Check the height of the tree
1421 : : * first, though. We don't apply the optimization to small
1422 : : * indexes. Defer the final check to this point to ensure that we don't
1423 : : * call _bt_getrootheight while holding a buffer lock.
1424 : : */
1488 pg@bowt.ie 1425 [ + + + + ]: 5274141 : if (BlockNumberIsValid(blockcache) &&
309 1426 : 1916143 : _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1488 1427 : 34419 : RelationSetTargetBlock(rel, blockcache);
1428 : : }
1429 : :
1430 : : /* be tidy */
1509 1431 [ + + ]: 3368588 : if (postingoff != 0)
1432 : : {
1433 : : /* itup is actually a modified copy of caller's original */
1434 : 12056 : pfree(nposting);
1435 : 12056 : pfree(itup);
1436 : : }
10141 scrappy@hub.org 1437 : 3368588 : }
1438 : :
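A compact way to read the WAL-record selection in the non-split branch of _bt_insertonpg() above: the record type is a pure function of isleaf, postingoff, and whether the metapage is also being updated. The helper below is an editor's sketch for illustration only -- it is not part of nbtinsert.c -- and assumes the XLOG_BTREE_INSERT_* constants from access/nbtxlog.h.

static inline uint8
bt_insert_xlog_info(bool isleaf, int postingoff, bool metabuf_valid)
{
	if (isleaf && postingoff == 0)
		return XLOG_BTREE_INSERT_LEAF;	/* simple leaf insert */

	if (postingoff != 0)
		return XLOG_BTREE_INSERT_POST;	/* leaf insert with posting list split */

	/* internal page insert that finishes a split on cbuf */
	return metabuf_valid ? XLOG_BTREE_INSERT_META : XLOG_BTREE_INSERT_UPPER;
}
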
1439 : : /*
1440 : : * _bt_split() -- split a page in the btree.
1441 : : *
1442 : : * On entry, buf is the page to split, and is pinned and write-locked.
1443 : : * newitemoff etc. tell us about the new item that must be inserted
1444 : : * along with the data from the original page.
1445 : : *
1446 : : * itup_key is used for suffix truncation on leaf pages (internal
1447 : : * page callers pass NULL). When splitting a non-leaf page, 'cbuf'
1448 : : * is the left-sibling of the page we're inserting the downlink for.
1449 : : * This function will clear the INCOMPLETE_SPLIT flag on it, and
1450 : : * release the buffer.
1451 : : *
1452 : : * orignewitem, nposting, and postingoff are needed when an insert of
1453 : : * orignewitem results in both a posting list split and a page split.
1454 : : * These extra posting list split details are used here in the same
1455 : : * way as they are used in the more common case where a posting list
1456 : : * split does not coincide with a page split. We need to deal with
1457 : : * posting list splits directly in order to ensure that everything
1458 : : * that follows from the insert of orignewitem is handled as a single
1459 : : * atomic operation (though caller's insert of a new pivot/downlink
1460 : : * into parent page will still be a separate operation). See
1461 : : * nbtree/README for details on the design of posting list splits.
1462 : : *
1463 : : * Returns the new right sibling of buf, pinned and write-locked.
1464 : : * The pin and lock on buf are maintained.
1465 : : */
1466 : : static Buffer
379 andres@anarazel.de 1467 : 10590 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1468 : : Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1469 : : IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1470 : : {
1471 : : Buffer rbuf;
1472 : : Page origpage;
1473 : : Page leftpage,
1474 : : rightpage;
1475 : : BlockNumber origpagenumber,
1476 : : rightpagenumber;
1477 : : BTPageOpaque ropaque,
1478 : : lopaque,
1479 : : oopaque;
7722 tgl@sss.pgh.pa.us 1480 : 10590 : Buffer sbuf = InvalidBuffer;
1481 : 10590 : Page spage = NULL;
1482 : 10590 : BTPageOpaque sopaque = NULL;
1483 : : Size itemsz;
1484 : : ItemId itemid;
1485 : : IndexTuple firstright,
1486 : : lefthighkey;
1487 : : OffsetNumber firstrightoff;
1488 : : OffsetNumber afterleftoff,
1489 : : afterrightoff,
1490 : : minusinfoff;
1491 : : OffsetNumber origpagepostingoff;
1492 : : OffsetNumber maxoff;
1493 : : OffsetNumber i;
1494 : : bool newitemonleft,
1495 : : isleaf,
1496 : : isrightmost;
1497 : :
1498 : : /*
1499 : : * origpage is the original page to be split. leftpage is a temporary
1500 : : * buffer that receives the left-sibling data, which will be copied back
1501 : : * into origpage on success. rightpage is the new page that will receive
1502 : : * the right-sibling data.
1503 : : *
1504 : : * leftpage is allocated after choosing a split point. rightpage's new
1505 : : * buffer isn't acquired until after leftpage is initialized and has its
1506 : : * new high key, the last point where splitting the page may fail (barring
1507 : : * corruption). Failing before acquiring the new buffer won't have lasting
1508 : : * consequences, since origpage won't have been modified and leftpage is
1509 : : * only workspace.
1510 : : */
2916 kgrittn@postgresql.o 1511 : 10590 : origpage = BufferGetPage(buf);
744 michael@paquier.xyz 1512 : 10590 : oopaque = BTPageGetOpaque(origpage);
1462 pg@bowt.ie 1513 : 10590 : isleaf = P_ISLEAF(oopaque);
1514 : 10590 : isrightmost = P_RIGHTMOST(oopaque);
1515 : 10590 : maxoff = PageGetMaxOffsetNumber(origpage);
4977 tgl@sss.pgh.pa.us 1516 : 10590 : origpagenumber = BufferGetBlockNumber(buf);
1517 : :
1518 : : /*
1519 : : * Choose a point to split origpage at.
1520 : : *
1521 : : * A split point can be thought of as a point _between_ two existing data
1522 : : * items on origpage (the lastleft and firstright tuples), provided you
1523 : : * pretend that the new item that didn't fit is already on origpage.
1524 : : *
1525 : : * Since origpage does not actually contain newitem, the representation of
1526 : : * split points needs to work with two boundary cases: splits where
1527 : : * newitem is lastleft, and splits where newitem is firstright.
1528 : : * newitemonleft resolves the ambiguity that would otherwise exist when
1529 : : * newitemoff == firstrightoff. In all other cases it's clear which side
1530 : : * of the split every tuple goes on from context. newitemonleft is
1531 : : * usually (but not always) redundant information.
1532 : : *
1533 : : * firstrightoff is supposed to be an origpage offset number, but it's
1534 : : * possible that its value will be maxoff+1, which is "past the end" of
1535 : : * origpage. This happens in the rare case where newitem goes after all
1536 : : * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1537 : : * origpage at the point that leaves newitem alone on new right page. Any
1538 : : * "!newitemonleft && newitemoff == firstrightoff" split point makes
1539 : : * newitem the firstright tuple, though, so this case isn't a special
1540 : : * case.
1541 : : */
1462 pg@bowt.ie 1542 : 10590 : firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1543 : : newitem, &newitemonleft);
1544 : :
1545 : : /* Allocate temp buffer for leftpage */
1798 1546 : 10590 : leftpage = PageGetTempPage(origpage);
1547 : 10590 : _bt_pageinit(leftpage, BufferGetPageSize(buf));
744 michael@paquier.xyz 1548 : 10590 : lopaque = BTPageGetOpaque(leftpage);
1549 : :
1550 : : /*
1551 : : * leftpage won't be the root when we're done. Also, clear the SPLIT_END
1552 : : * and HAS_GARBAGE flags.
1553 : : */
8593 vadim4o@yahoo.com 1554 : 10590 : lopaque->btpo_flags = oopaque->btpo_flags;
6473 tgl@sss.pgh.pa.us 1555 : 10590 : lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1556 : : /* set flag in leftpage indicating that rightpage has no downlink yet */
3680 heikki.linnakangas@i 1557 : 10590 : lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
9716 bruce@momjian.us 1558 : 10590 : lopaque->btpo_prev = oopaque->btpo_prev;
1559 : : /* handle btpo_next after rightpage buffer acquired */
1145 pg@bowt.ie 1560 : 10590 : lopaque->btpo_level = oopaque->btpo_level;
1561 : : /* handle btpo_cycleid after rightpage buffer acquired */
1562 : :
1563 : : /*
1564 : : * Copy the original page's LSN into leftpage, which will become the
1565 : : * updated version of the page. We need this because XLogInsert will
1566 : : * examine the LSN and possibly dump it in a page image.
1567 : : */
1798 1568 : 10590 : PageSetLSN(leftpage, PageGetLSN(origpage));
1569 : :
1570 : : /*
1571 : : * Determine page offset number of existing overlapped-with-orignewitem
1572 : : * posting list when it is necessary to perform a posting list split in
1573 : : * passing. Note that newitem was already changed by caller (newitem no
1574 : : * longer has the orignewitem TID).
1575 : : *
1576 : : * This page offset number (origpagepostingoff) will be used to pretend
1577 : : * that the posting split has already taken place, even though the
1578 : : * required modifications to origpage won't occur until we reach the
1579 : : * critical section. The lastleft and firstright tuples of our page split
1580 : : * point should, in effect, come from an imaginary version of origpage
1581 : : * that has the nposting tuple instead of the original posting list tuple.
1582 : : *
1583 : : * Note: _bt_findsplitloc() should have compensated for coinciding posting
1584 : : * list splits in just the same way, at least in theory. It doesn't
1585 : : * bother with that, though. In practice it won't affect its choice of
1586 : : * split point.
1587 : : */
1509 1588 : 10590 : origpagepostingoff = InvalidOffsetNumber;
1589 [ + + ]: 10590 : if (postingoff != 0)
1590 : : {
1591 [ - + ]: 23 : Assert(isleaf);
1592 [ - + ]: 23 : Assert(ItemPointerCompare(&orignewitem->t_tid,
1593 : : &newitem->t_tid) < 0);
1594 [ - + ]: 23 : Assert(BTreeTupleIsPosting(nposting));
1595 : 23 : origpagepostingoff = OffsetNumberPrev(newitemoff);
1596 : : }
1597 : :
1598 : : /*
1599 : : * The high key for the new left page is a possibly-truncated copy of
1600 : : * firstright on the leaf level (it's "firstright itself" on internal
1601 : : * pages; see !isleaf comments below). This may seem to be contrary to
1602 : : * Lehman & Yao's approach of using a copy of lastleft as the new high key
1603 : : * when splitting on the leaf level. It isn't, though.
1604 : : *
1605 : : * Suffix truncation will leave the left page's high key fully equal to
1606 : : * lastleft when lastleft and firstright are equal prior to heap TID (that
1607 : : * is, the tiebreaker TID value comes from lastleft). It isn't actually
1608 : : * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1609 : : * "subtree" invariant to hold. It's sufficient to make sure that the new
1610 : : * leaf high key is strictly less than firstright, and greater than or
1611 : : * equal to (not necessarily equal to) lastleft. In other words, when
1612 : : * suffix truncation isn't possible during a leaf page split, we take
1613 : : * L&Y's exact approach to generating a new high key for the left page.
1614 : : * (Actually, that is slightly inaccurate. We don't just use a copy of
1615 : : * lastleft. A tuple with all the keys from firstright but the max heap
1616 : : * TID from lastleft is used, to avoid introducing a special case.)
1617 : : */
1462 1618 [ + + + + ]: 10590 : if (!newitemonleft && newitemoff == firstrightoff)
1619 : : {
1620 : : /* incoming tuple becomes firstright */
8668 tgl@sss.pgh.pa.us 1621 : 16 : itemsz = newitemsz;
1462 pg@bowt.ie 1622 : 16 : firstright = newitem;
1623 : : }
1624 : : else
1625 : : {
1626 : : /* existing item at firstrightoff becomes firstright */
1627 : 10574 : itemid = PageGetItemId(origpage, firstrightoff);
8668 tgl@sss.pgh.pa.us 1628 : 10574 : itemsz = ItemIdGetLength(itemid);
1462 pg@bowt.ie 1629 : 10574 : firstright = (IndexTuple) PageGetItem(origpage, itemid);
1630 [ - + ]: 10574 : if (firstrightoff == origpagepostingoff)
1462 pg@bowt.ie 1631 :UBC 0 : firstright = nposting;
1632 : : }
1633 : :
1462 pg@bowt.ie 1634 [ + + ]:CBC 10590 : if (isleaf)
1635 : : {
1636 : : IndexTuple lastleft;
1637 : :
1638 : : /* Attempt suffix truncation for leaf page splits */
1639 [ + + + + ]: 10461 : if (newitemonleft && newitemoff == firstrightoff)
1640 : : {
1641 : : /* incoming tuple becomes lastleft */
1852 1642 : 185 : lastleft = newitem;
1643 : : }
1644 : : else
1645 : : {
1646 : : OffsetNumber lastleftoff;
1647 : :
1648 : : /* existing item before firstrightoff becomes lastleft */
1462 1649 : 10276 : lastleftoff = OffsetNumberPrev(firstrightoff);
1852 1650 [ + + - + ]: 10276 : Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1651 : 10276 : itemid = PageGetItemId(origpage, lastleftoff);
1652 : 10276 : lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1509 1653 [ + + ]: 10276 : if (lastleftoff == origpagepostingoff)
1509 pg@bowt.ie 1654 :GBC 2 : lastleft = nposting;
1655 : : }
1656 : :
1462 pg@bowt.ie 1657 :CBC 10461 : lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1658 : 10461 : itemsz = IndexTupleSize(lefthighkey);
1659 : : }
1660 : : else
1661 : : {
1662 : : /*
1663 : : * Don't perform suffix truncation on a copy of firstright to make
1664 : : * left page high key for internal page splits. Must use firstright
1665 : : * as new high key directly.
1666 : : *
1667 : : * Each distinct separator key value originates as a leaf level high
1668 : : * key; all other separator keys/pivot tuples are copied from one
1669 : : * level down. A separator key in a grandparent page must be
1670 : : * identical to the high key in the rightmost parent page of the subtree
1671 : : * to its left, which must itself be identical to the high key in the
1672 : : * rightmost child page of that same subtree (this even applies to the
1673 : : * separator from the grandparent's high key). There must always be an unbroken
1674 : : * "seam" of identical separator keys that guide index scans at every
1675 : : * level, starting from the grandparent. That's why suffix truncation
1676 : : * is unsafe here.
1677 : : *
1678 : : * Internal page splits will truncate firstright into a "negative
1679 : : * infinity" data item when it gets inserted on the new right page
1680 : : * below, though. This happens during the call to _bt_pgaddtup() for
1681 : : * the new first data item for right page. Do not confuse this
1682 : : * mechanism with suffix truncation. It is just a convenient way of
1683 : : * implementing page splits that split the internal page "inside"
1684 : : * firstright. The lefthighkey separator key cannot appear a second
1685 : : * time in the right page (only firstright's downlink goes in right
1686 : : * page).
1687 : : */
1688 : 129 : lefthighkey = firstright;
1689 : : }
1690 : :
1691 : : /*
1692 : : * Add new high key to leftpage
1693 : : */
1694 : 10590 : afterleftoff = P_HIKEY;
1695 : :
1696 [ + - - + ]: 10590 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1697 [ + - - + ]: 10590 : Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1698 : : IndexRelationGetNumberOfKeyAttributes(rel));
1699 [ - + ]: 10590 : Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1700 [ - + ]: 10590 : if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1701 : : false) == InvalidOffsetNumber)
1462 pg@bowt.ie 1702 [ # # ]:UBC 0 : elog(ERROR, "failed to add high key to the left sibling"
1703 : : " while splitting block %u of index \"%s\"",
1704 : : origpagenumber, RelationGetRelationName(rel));
1462 pg@bowt.ie 1705 :CBC 10590 : afterleftoff = OffsetNumberNext(afterleftoff);
1706 : :
1707 : : /*
1708 : : * Acquire a new right page to split into, now that left page has a new
1709 : : * high key. From here on, it's not okay to throw an error without
1710 : : * zeroing rightpage first. This coding rule ensures that we won't
1711 : : * confuse future VACUUM operations, which might otherwise try to re-find
1712 : : * a downlink to a leftover junk page as the page undergoes deletion.
1713 : : *
1714 : : * It would be reasonable to start the critical section just after the new
1715 : : * rightpage buffer is acquired instead; that would allow us to avoid
1716 : : * leftover junk pages without bothering to zero rightpage. We do it this
1717 : : * way because it avoids an unnecessary PANIC when either origpage or its
1718 : : * existing sibling page is corrupt.
1719 : : */
309 1720 : 10590 : rbuf = _bt_allocbuf(rel, heaprel);
1798 1721 : 10590 : rightpage = BufferGetPage(rbuf);
1722 : 10590 : rightpagenumber = BufferGetBlockNumber(rbuf);
1723 : : /* rightpage was initialized by _bt_getbuf */
744 michael@paquier.xyz 1724 : 10590 : ropaque = BTPageGetOpaque(rightpage);
1725 : :
1726 : : /*
1727 : : * Finish off remaining leftpage special area fields. They cannot be set
1728 : : * before both origpage (leftpage) and rightpage buffers are acquired and
1729 : : * locked.
1730 : : *
1731 : : * btpo_cycleid is only used with leaf pages, though we set it here in all
1732 : : * cases just to be consistent.
1733 : : */
1798 pg@bowt.ie 1734 : 10590 : lopaque->btpo_next = rightpagenumber;
1735 : 10590 : lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1736 : :
1737 : : /*
1738 : : * rightpage won't be the root when we're done. Also, clear the SPLIT_END
1739 : : * and HAS_GARBAGE flags.
1740 : : */
1741 : 10590 : ropaque->btpo_flags = oopaque->btpo_flags;
1742 : 10590 : ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1743 : 10590 : ropaque->btpo_prev = origpagenumber;
1744 : 10590 : ropaque->btpo_next = oopaque->btpo_next;
1145 1745 : 10590 : ropaque->btpo_level = oopaque->btpo_level;
1798 1746 : 10590 : ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1747 : :
1748 : : /*
1749 : : * Add new high key to rightpage where necessary.
1750 : : *
1751 : : * If the page we're splitting is not the rightmost page at its level in
1752 : : * the tree, then the first entry on the page is the high key from
1753 : : * origpage.
1754 : : */
1462 1755 : 10590 : afterrightoff = P_HIKEY;
1756 : :
1757 [ + + ]: 10590 : if (!isrightmost)
1758 : : {
1759 : : IndexTuple righthighkey;
1760 : :
1798 1761 : 4505 : itemid = PageGetItemId(origpage, P_HIKEY);
1762 : 4505 : itemsz = ItemIdGetLength(itemid);
1462 1763 : 4505 : righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1764 [ + - - + ]: 4505 : Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1765 [ + - - + ]: 4505 : Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1766 : : IndexRelationGetNumberOfKeyAttributes(rel));
1767 [ - + ]: 4505 : if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1768 : : false, false) == InvalidOffsetNumber)
1769 : : {
1798 pg@bowt.ie 1770 :UBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1462 1771 [ # # ]: 0 : elog(ERROR, "failed to add high key to the right sibling"
1772 : : " while splitting block %u of index \"%s\"",
1773 : : origpagenumber, RelationGetRelationName(rel));
1774 : : }
1462 pg@bowt.ie 1775 :CBC 4505 : afterrightoff = OffsetNumberNext(afterrightoff);
1776 : : }
1777 : :
1778 : : /*
1779 : : * Internal page splits truncate the first data item on the right page --
1780 : : * it becomes the "minus infinity" item for the page. Set this up here.
1781 : : */
1782 : 10590 : minusinfoff = InvalidOffsetNumber;
1783 [ + + ]: 10590 : if (!isleaf)
1784 : 129 : minusinfoff = afterrightoff;
1785 : :
1786 : : /*
1787 : : * Now transfer all the data items (non-pivot tuples in isleaf case, or
1788 : : * additional pivot tuples in !isleaf case) to the appropriate page.
1789 : : *
1790 : : * Note: we *must* insert at least the right page's items in item-number
1791 : : * order, for the benefit of _bt_restore_page().
1792 : : */
8668 tgl@sss.pgh.pa.us 1793 [ + + + + ]: 3275633 : for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1794 : : {
1795 : : IndexTuple dataitem;
1796 : :
9716 bruce@momjian.us 1797 : 3265043 : itemid = PageGetItemId(origpage, i);
1798 : 3265043 : itemsz = ItemIdGetLength(itemid);
1462 pg@bowt.ie 1799 : 3265043 : dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1800 : :
1801 : : /* replace original item with nposting due to posting split? */
1509 1802 [ + + ]: 3265043 : if (i == origpagepostingoff)
1803 : : {
1462 1804 [ - + ]: 23 : Assert(BTreeTupleIsPosting(dataitem));
1509 1805 [ - + ]: 23 : Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1462 1806 : 23 : dataitem = nposting;
1807 : : }
1808 : :
1809 : : /* does new item belong before this one? */
1509 1810 [ + + ]: 3265020 : else if (i == newitemoff)
1811 : : {
8668 tgl@sss.pgh.pa.us 1812 [ + + ]: 5659 : if (newitemonleft)
1813 : : {
1462 pg@bowt.ie 1814 [ - + ]: 1628 : Assert(newitemoff <= firstrightoff);
1815 [ - + ]: 1628 : if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1816 : : false))
1817 : : {
4977 tgl@sss.pgh.pa.us 1818 :UBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1819 [ # # ]: 0 : elog(ERROR, "failed to add new item to the left sibling"
1820 : : " while splitting block %u of index \"%s\"",
1821 : : origpagenumber, RelationGetRelationName(rel));
1822 : : }
1462 pg@bowt.ie 1823 :CBC 1628 : afterleftoff = OffsetNumberNext(afterleftoff);
1824 : : }
1825 : : else
1826 : : {
1827 [ - + ]: 4031 : Assert(newitemoff >= firstrightoff);
1828 [ - + ]: 4031 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1829 : : afterrightoff == minusinfoff))
1830 : : {
4977 tgl@sss.pgh.pa.us 1831 :UBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1832 [ # # ]: 0 : elog(ERROR, "failed to add new item to the right sibling"
1833 : : " while splitting block %u of index \"%s\"",
1834 : : origpagenumber, RelationGetRelationName(rel));
1835 : : }
1462 pg@bowt.ie 1836 :CBC 4031 : afterrightoff = OffsetNumberNext(afterrightoff);
1837 : : }
1838 : : }
1839 : :
1840 : : /* decide which page to put it on */
1841 [ + + ]: 3265043 : if (i < firstrightoff)
1842 : : {
1843 [ - + ]: 2492894 : if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1844 : : {
4977 tgl@sss.pgh.pa.us 1845 :UBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1846 [ # # ]: 0 : elog(ERROR, "failed to add old item to the left sibling"
1847 : : " while splitting block %u of index \"%s\"",
1848 : : origpagenumber, RelationGetRelationName(rel));
1849 : : }
1462 pg@bowt.ie 1850 :CBC 2492894 : afterleftoff = OffsetNumberNext(afterleftoff);
1851 : : }
1852 : : else
1853 : : {
1854 [ - + ]: 772149 : if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1855 : : afterrightoff == minusinfoff))
1856 : : {
4977 tgl@sss.pgh.pa.us 1857 :UBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1858 [ # # ]: 0 : elog(ERROR, "failed to add old item to the right sibling"
1859 : : " while splitting block %u of index \"%s\"",
1860 : : origpagenumber, RelationGetRelationName(rel));
1861 : : }
1462 pg@bowt.ie 1862 :CBC 772149 : afterrightoff = OffsetNumberNext(afterrightoff);
1863 : : }
1864 : : }
1865 : :
1866 : : /* Handle case where newitem goes at the end of rightpage */
8668 tgl@sss.pgh.pa.us 1867 [ + + ]: 10590 : if (i <= newitemoff)
1868 : : {
1869 : : /*
1870 : : * Can't have newitemonleft here; that would imply we were told to put
1871 : : * *everything* on the left page, which cannot fit (if it could, we'd
1872 : : * not be splitting the page).
1873 : : */
1462 pg@bowt.ie 1874 [ + - - + ]: 4931 : Assert(!newitemonleft && newitemoff == maxoff + 1);
1875 [ - + ]: 4931 : if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1876 : : afterrightoff == minusinfoff))
1877 : : {
4977 tgl@sss.pgh.pa.us 1878 :UBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1879 [ # # ]: 0 : elog(ERROR, "failed to add new item to the right sibling"
1880 : : " while splitting block %u of index \"%s\"",
1881 : : origpagenumber, RelationGetRelationName(rel));
1882 : : }
1462 pg@bowt.ie 1883 :CBC 4931 : afterrightoff = OffsetNumberNext(afterrightoff);
1884 : : }
1885 : :
1886 : : /*
1887 : : * We have to grab the original right sibling (if any) and update its prev
1888 : : * link. We are guaranteed that this is deadlock-free, since we couple
1889 : : * the locks in the standard order: left to right.
1890 : : */
1891 [ + + ]: 10590 : if (!isrightmost)
1892 : : {
309 1893 : 4505 : sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
2916 kgrittn@postgresql.o 1894 : 4505 : spage = BufferGetPage(sbuf);
744 michael@paquier.xyz 1895 : 4505 : sopaque = BTPageGetOpaque(spage);
4977 tgl@sss.pgh.pa.us 1896 [ - + ]: 4505 : if (sopaque->btpo_prev != origpagenumber)
1897 : : {
4977 tgl@sss.pgh.pa.us 1898 :UBC 0 : memset(rightpage, 0, BufferGetPageSize(rbuf));
1718 peter@eisentraut.org 1899 [ # # ]: 0 : ereport(ERROR,
1900 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1901 : : errmsg_internal("right sibling's left-link doesn't match: "
1902 : : "block %u links to %u instead of expected %u in index \"%s\"",
1903 : : oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1904 : : RelationGetRelationName(rel))));
1905 : : }
1906 : :
1907 : : /*
1908 : : * Check to see if we can set the SPLIT_END flag in the right-hand
1909 : : * split page; this can save some I/O for vacuum since it need not
1910 : : * proceed to the right sibling. We can set the flag if the right
1911 : : * sibling has a different cycleid: that means it could not be part of
1912 : : * a group of pages that were all split off from the same ancestor
1913 : : * page. If you're confused, imagine that page A splits to A B and
1914 : : * then again, yielding A C B, while vacuum is in progress. Tuples
1915 : : * originally in A could now be in either B or C, hence vacuum must
1916 : : * examine both pages. But if D, our right sibling, has a different
1917 : : * cycleid then it could not contain any tuples that were in A when
1918 : : * the vacuum started.
1919 : : */
6551 tgl@sss.pgh.pa.us 1920 [ - + ]:CBC 4505 : if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
6551 tgl@sss.pgh.pa.us 1921 :UBC 0 : ropaque->btpo_flags |= BTP_SPLIT_END;
1922 : : }
1923 : :
1924 : : /*
1925 : : * Right sibling is locked, new siblings are prepared, but original page
1926 : : * is not updated yet.
1927 : : *
1928 : : * NO EREPORT(ERROR) till right sibling is updated. We can get away with
1929 : : * not starting the critical section till here because we haven't been
1930 : : * scribbling on the original page yet; see comments above.
1931 : : */
8493 tgl@sss.pgh.pa.us 1932 :CBC 10590 : START_CRIT_SECTION();
1933 : :
1934 : : /*
1935 : : * By here, the original data page has been split into two new halves, and
1936 : : * these are correct. The algorithm requires that the left page never
1937 : : * move during a split, so we copy the new left page back on top of the
1938 : : * original. We need to do this before writing the WAL record, so that
1939 : : * XLogInsert can WAL log an image of the page if necessary.
1940 : : */
6275 bruce@momjian.us 1941 : 10590 : PageRestoreTempPage(leftpage, origpage);
1942 : : /* leftpage, lopaque must not be used below here */
1943 : :
6213 tgl@sss.pgh.pa.us 1944 : 10590 : MarkBufferDirty(buf);
1945 : 10590 : MarkBufferDirty(rbuf);
1946 : :
1462 pg@bowt.ie 1947 [ + + ]: 10590 : if (!isrightmost)
1948 : : {
4977 tgl@sss.pgh.pa.us 1949 : 4505 : sopaque->btpo_prev = rightpagenumber;
6213 1950 : 4505 : MarkBufferDirty(sbuf);
1951 : : }
1952 : :
1953 : : /*
1954 : : * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1955 : : * a split
1956 : : */
3680 heikki.linnakangas@i 1957 [ + + ]: 10590 : if (!isleaf)
1958 : : {
2916 kgrittn@postgresql.o 1959 : 129 : Page cpage = BufferGetPage(cbuf);
744 michael@paquier.xyz 1960 : 129 : BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1961 : :
3680 heikki.linnakangas@i 1962 : 129 : cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1963 : 129 : MarkBufferDirty(cbuf);
1964 : : }
1965 : :
1966 : : /* XLOG stuff */
4871 rhaas@postgresql.org 1967 [ + + + + : 10590 : if (RelationNeedsWAL(rel))
+ + + - ]
1968 : : {
1969 : : xl_btree_split xlrec;
1970 : : uint8 xlinfo;
1971 : : XLogRecPtr recptr;
1972 : :
1145 pg@bowt.ie 1973 : 9759 : xlrec.level = ropaque->btpo_level;
1974 : : /* See comments below on newitem, orignewitem, and posting lists */
1462 1975 : 9759 : xlrec.firstrightoff = firstrightoff;
3433 heikki.linnakangas@i 1976 : 9759 : xlrec.newitemoff = newitemoff;
1509 pg@bowt.ie 1977 : 9759 : xlrec.postingoff = 0;
1462 1978 [ + + + + ]: 9759 : if (postingoff != 0 && origpagepostingoff < firstrightoff)
1509 1979 : 17 : xlrec.postingoff = postingoff;
1980 : :
3433 heikki.linnakangas@i 1981 : 9759 : XLogBeginInsert();
1982 : 9759 : XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
1983 : :
1984 : 9759 : XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1985 : 9759 : XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1986 : : /* Log original right sibling, since we've changed its prev-pointer */
1462 pg@bowt.ie 1987 [ + + ]: 9759 : if (!isrightmost)
3433 heikki.linnakangas@i 1988 : 4499 : XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1462 pg@bowt.ie 1989 [ + + ]: 9759 : if (!isleaf)
3433 heikki.linnakangas@i 1990 : 129 : XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
1991 : :
1992 : : /*
1993 : : * Log the new item, if it was inserted on the left page. (If it was
1994 : : * put on the right page, we don't need to explicitly WAL log it
1995 : : * because it's included with all the other items on the right page.)
1996 : : * Show the new item as belonging to the left page buffer, so that it
1997 : : * is not stored if XLogInsert decides it needs a full-page image of
1998 : : * the left page. We always store newitemoff in the record, though.
1999 : : *
2000 : : * The details are sometimes slightly different for page splits that
2001 : : * coincide with a posting list split. If both the replacement
2002 : : * posting list and newitem go on the right page, then we don't need
2003 : : * to log anything extra, just like the simple !newitemonleft
2004 : : * no-posting-split case (postingoff is set to zero in the WAL record,
2005 : : * so recovery doesn't need to process a posting list split at all).
2006 : : * Otherwise, we set postingoff and log orignewitem instead of
2007 : : * newitem, despite having actually inserted newitem. REDO routine
2008 : : * must reconstruct nposting and newitem using _bt_swap_posting().
2009 : : *
2010 : : * Note: It's possible that our page split point is the point that
2011 : : * makes the posting list lastleft and newitem firstright. This is
2012 : : * the only case where we log orignewitem/newitem despite newitem
2013 : : * going on the right page. If XLogInsert decides that it can omit
2014 : : * orignewitem due to logging a full-page image of the left page,
2015 : : * everything still works out, since recovery only needs to log
2016 : : * orignewitem for items on the left page (just like the regular
2017 : : * newitem-logged case).
2018 : : */
1509 pg@bowt.ie 2019 [ + + + + ]: 9759 : if (newitemonleft && xlrec.postingoff == 0)
1462 2020 : 1610 : XLogRegisterBufData(0, (char *) newitem, newitemsz);
1509 2021 [ + + ]: 8149 : else if (xlrec.postingoff != 0)
2022 : : {
1462 2023 [ - + ]: 17 : Assert(isleaf);
2024 [ + + - + ]: 17 : Assert(newitemonleft || firstrightoff == newitemoff);
2025 [ - + ]: 17 : Assert(newitemsz == IndexTupleSize(orignewitem));
2026 : 17 : XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
2027 : : }
2028 : :
2029 : : /* Log the left page's new high key */
2030 [ + + ]: 9759 : if (!isleaf)
2031 : : {
2032 : : /* lefthighkey isn't local copy, get current pointer */
2033 : 129 : itemid = PageGetItemId(origpage, P_HIKEY);
2034 : 129 : lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2035 : : }
2036 : 9759 : XLogRegisterBufData(0, (char *) lefthighkey,
2037 : 9759 : MAXALIGN(IndexTupleSize(lefthighkey)));
2038 : :
2039 : : /*
2040 : : * Log the contents of the right page in the format understood by
2041 : : * _bt_restore_page(). The whole right page will be recreated.
2042 : : *
2043 : : * Direct access to the page is not ideal, but it is faster - we should
2044 : : * implement a new function in the page API. Note we only store the tuples
2045 : : * themselves, knowing that they were inserted in item-number order
2046 : : * and so the line pointers can be reconstructed. See comments for
2047 : : * _bt_restore_page().
2048 : : */
3433 heikki.linnakangas@i 2049 : 9759 : XLogRegisterBufData(1,
2489 tgl@sss.pgh.pa.us 2050 : 9759 : (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
3433 heikki.linnakangas@i 2051 : 9759 : ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2052 : :
1852 pg@bowt.ie 2053 [ + + ]: 9759 : xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
3433 heikki.linnakangas@i 2054 : 9759 : recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2055 : :
6275 alvherre@alvh.no-ip. 2056 : 9759 : PageSetLSN(origpage, recptr);
8593 vadim4o@yahoo.com 2057 : 9759 : PageSetLSN(rightpage, recptr);
1462 pg@bowt.ie 2058 [ + + ]: 9759 : if (!isrightmost)
8593 vadim4o@yahoo.com 2059 : 4499 : PageSetLSN(spage, recptr);
3666 heikki.linnakangas@i 2060 [ + + ]: 9759 : if (!isleaf)
2916 kgrittn@postgresql.o 2061 : 129 : PageSetLSN(BufferGetPage(cbuf), recptr);
2062 : : }
2063 : :
8482 tgl@sss.pgh.pa.us 2064 [ - + ]: 10590 : END_CRIT_SECTION();
2065 : :
2066 : : /* release the old right sibling */
1462 pg@bowt.ie 2067 [ + + ]: 10590 : if (!isrightmost)
6589 tgl@sss.pgh.pa.us 2068 : 4505 : _bt_relbuf(rel, sbuf);
2069 : :
2070 : : /* release the child */
3680 heikki.linnakangas@i 2071 [ + + ]: 10590 : if (!isleaf)
2072 : 129 : _bt_relbuf(rel, cbuf);
2073 : :
2074 : : /* be tidy */
1462 pg@bowt.ie 2075 [ + + ]: 10590 : if (isleaf)
2076 : 10461 : pfree(lefthighkey);
2077 : :
2078 : : /* split's done */
9357 bruce@momjian.us 2079 : 10590 : return rbuf;
2080 : : }
2081 : :
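The split-point representation described near the top of _bt_split() above can be restated as two small predicates. These are hypothetical helpers sketched here for illustration (they do not exist in nbtinsert.c); firstrightoff and newitemonleft are assumed to come from _bt_findsplitloc().

static inline bool
bt_split_existing_item_goes_left(OffsetNumber i, OffsetNumber firstrightoff)
{
	/* items at firstrightoff and beyond become the right page's data */
	return i < firstrightoff;
}

static inline bool
bt_split_newitem_goes_left(OffsetNumber newitemoff, OffsetNumber firstrightoff,
						   bool newitemonleft)
{
	/* the only ambiguous case; _bt_findsplitloc's flag resolves it */
	if (newitemoff == firstrightoff)
		return newitemonleft;

	return newitemoff < firstrightoff;
}

Note that "newitemoff == maxoff + 1 with !newitemonleft" is the case handled at the end of the transfer loop above, where newitem ends up as the last (possibly only) item on the new right page.
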
2082 : : /*
2083 : : * _bt_insert_parent() -- Insert downlink into parent, completing split.
2084 : : *
2085 : : * On entry, buf and rbuf are the left and right split pages, which we
2086 : : * still hold write locks on. Both locks will be released here. We
2087 : : * release the rbuf lock once we have a write lock on the page that we
2088 : : * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2089 : : * The lock on buf is released at the same point as the lock on the parent
2090 : : * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2091 : : * atomic operation that completes the split by inserting a new downlink.
2092 : : *
2093 : : * stack - stack showing how we got here. Will be NULL when splitting the
2094 : : * true root, or during a concurrent root split, where we can be inefficient
2095 : : * isroot - we split the true root
2096 : : * isonly - we split a page alone on its level (might have been fast root)
2097 : : */
2098 : : static void
7723 tgl@sss.pgh.pa.us 2099 : 10590 : _bt_insert_parent(Relation rel,
2100 : : Relation heaprel,
2101 : : Buffer buf,
2102 : : Buffer rbuf,
2103 : : BTStack stack,
2104 : : bool isroot,
2105 : : bool isonly)
2106 : : {
309 pg@bowt.ie 2107 [ - + ]: 10590 : Assert(heaprel != NULL);
2108 : :
2109 : : /*
2110 : : * Here we have to do something Lehman and Yao don't talk about: deal with
2111 : : * a root split and construction of a new root. If our stack is empty
2112 : : * then we have just split a node on what had been the root level when we
2113 : : * descended the tree. If it was still the root then we perform a
2114 : : * new-root construction. If it *wasn't* the root anymore, search to find
2115 : : * the next higher level that someone constructed meanwhile, and find the
2116 : : * right place to insert as for the normal case.
2117 : : *
2118 : : * If we have to search for the parent level, we do so by re-descending
2119 : : * from the root. This is not super-efficient, but it's rare enough not
2120 : : * to matter.
2121 : : */
1244 2122 [ + + ]: 10590 : if (isroot)
2123 : : {
2124 : : Buffer rootbuf;
2125 : :
7403 neilc@samurai.com 2126 [ - + ]: 495 : Assert(stack == NULL);
1244 pg@bowt.ie 2127 [ - + ]: 495 : Assert(isonly);
2128 : : /* create a new root node one level up and update the metapage */
309 2129 : 495 : rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2130 : : /* release the split buffers */
6589 tgl@sss.pgh.pa.us 2131 : 495 : _bt_relbuf(rel, rootbuf);
2132 : 495 : _bt_relbuf(rel, rbuf);
2133 : 495 : _bt_relbuf(rel, buf);
2134 : : }
2135 : : else
2136 : : {
7723 2137 : 10095 : BlockNumber bknum = BufferGetBlockNumber(buf);
2138 : 10095 : BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2916 kgrittn@postgresql.o 2139 : 10095 : Page page = BufferGetPage(buf);
2140 : : IndexTuple new_item;
2141 : : BTStackData fakestack;
2142 : : IndexTuple ritem;
2143 : : Buffer pbuf;
2144 : :
7403 neilc@samurai.com 2145 [ + + ]: 10095 : if (stack == NULL)
2146 : : {
2147 : : BTPageOpaque opaque;
2148 : :
3503 heikki.linnakangas@i 2149 [ - + ]: 12 : elog(DEBUG2, "concurrent ROOT page split");
744 michael@paquier.xyz 2150 : 12 : opaque = BTPageGetOpaque(page);
2151 : :
2152 : : /*
2153 : : * We should never reach here when a leaf page split takes place
2154 : : * despite the insert of newitem being able to apply the fastpath
2155 : : * optimization. Make sure of that with an assertion.
2156 : : *
2157 : : * This is more of a performance issue than a correctness issue.
2158 : : * The fastpath won't have a descent stack. Using a phony stack
2159 : : * here works, but never rely on that. The fastpath should be
2160 : : * rejected within _bt_search_insert() when the rightmost leaf
2161 : : * page will split, since it's faster to go through _bt_search()
2162 : : * and get a stack in the usual way.
2163 : : */
1244 pg@bowt.ie 2164 [ + - + - : 12 : Assert(!(P_ISLEAF(opaque) &&
- + ]
2165 : : BlockNumberIsValid(RelationGetTargetBlock(rel))));
2166 : :
2167 : : /* Find the leftmost page at the next level up */
219 tmunro@postgresql.or 2168 :GNC 12 : pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2169 : : /* Set up a phony stack entry pointing there */
7723 tgl@sss.pgh.pa.us 2170 :CBC 12 : stack = &fakestack;
2171 : 12 : stack->bts_blkno = BufferGetBlockNumber(pbuf);
2172 : 12 : stack->bts_offset = InvalidOffsetNumber;
2173 : 12 : stack->bts_parent = NULL;
2174 : 12 : _bt_relbuf(rel, pbuf);
2175 : : }
2176 : :
2177 : : /* get high key from left, a strict lower bound for new right page */
6654 2178 : 10095 : ritem = (IndexTuple) PageGetItem(page,
2179 : : PageGetItemId(page, P_HIKEY));
2180 : :
2181 : : /* form an index tuple that points at the new right page */
2182 : 10095 : new_item = CopyIndexTuple(ritem);
1581 pg@bowt.ie 2183 : 10095 : BTreeTupleSetDownLink(new_item, rbknum);
2184 : :
2185 : : /*
2186 : : * Re-find and write lock the parent of buf.
2187 : : *
2188 : : * It's possible that the location of buf's downlink has changed since
2189 : : * our initial _bt_search() descent. _bt_getstackbuf() will detect
2190 : : * and recover from this, updating the stack, which ensures that the
2191 : : * new downlink will be inserted at the correct offset. Even buf's
2192 : : * parent may have changed.
2193 : : */
379 andres@anarazel.de 2194 : 10095 : pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2195 : :
2196 : : /*
2197 : : * Unlock the right child. The left child will be unlocked in
2198 : : * _bt_insertonpg().
2199 : : *
2200 : : * Unlocking the right child must be delayed until here to ensure that
2201 : : * no concurrent VACUUM operation can become confused. Page deletion
2202 : : * cannot be allowed to fail to re-find a downlink for the rbuf page.
2203 : : * (Actually, this is just a vestige of how things used to work. The
2204 : : * page deletion code is expected to check for the INCOMPLETE_SPLIT
2205 : : * flag on the left child. It won't attempt deletion of the right
2206 : : * child until the split is complete. Despite all this, we opt to
2207 : : * conservatively delay unlocking the right child until here.)
2208 : : */
6589 tgl@sss.pgh.pa.us 2209 : 10095 : _bt_relbuf(rel, rbuf);
2210 : :
7723 2211 [ - + ]: 10095 : if (pbuf == InvalidBuffer)
1718 peter@eisentraut.org 2212 [ # # ]:UBC 0 : ereport(ERROR,
2213 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2214 : : errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2215 : : RelationGetRelationName(rel), bknum, rbknum)));
2216 : :
2217 : : /* Recursively insert into the parent */
379 andres@anarazel.de 2218 :CBC 10095 : _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
1490 pg@bowt.ie 2219 : 10095 : new_item, MAXALIGN(IndexTupleSize(new_item)),
1244 2220 : 10095 : stack->bts_offset + 1, 0, isonly);
2221 : :
2222 : : /* be tidy */
7723 tgl@sss.pgh.pa.us 2223 : 10095 : pfree(new_item);
2224 : : }
2225 : 10590 : }
2226 : :
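The downlink that _bt_insert_parent() above inserts into the parent is formed from the left page's high key, with its downlink pointed at the new right sibling. Below is a minimal sketch of that step, assuming the usual nbtree.h helpers; the wrapper itself is hypothetical and not part of nbtinsert.c.

static IndexTuple
bt_make_downlink_for_right_sibling(Page leftpage, BlockNumber rbknum)
{
	ItemId		hitemid = PageGetItemId(leftpage, P_HIKEY);
	IndexTuple	hikey = (IndexTuple) PageGetItem(leftpage, hitemid);
	IndexTuple	downlink = CopyIndexTuple(hikey);

	/* left page's high key is a strict lower bound for the new right page */
	BTreeTupleSetDownLink(downlink, rbknum);
	return downlink;
}
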
2227 : : /*
2228 : : * _bt_finish_split() -- Finish an incomplete split
2229 : : *
2230 : : * A crash or other failure can leave a split incomplete. The insertion
2231 : : * routines won't allow an insertion on a page that is incompletely split.
2232 : : * Before inserting on such a page, call _bt_finish_split().
2233 : : *
2234 : : * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked
2235 : : * and unpinned.
2236 : : *
2237 : : * Caller must provide a valid heaprel, since finishing a page split requires
2238 : : * allocating a new page if and when the parent page splits in turn.
2239 : : */
2240 : : void
379 andres@anarazel.de 2241 :UBC 0 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2242 : : {
2916 kgrittn@postgresql.o 2243 : 0 : Page lpage = BufferGetPage(lbuf);
744 michael@paquier.xyz 2244 : 0 : BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2245 : : Buffer rbuf;
2246 : : Page rpage;
2247 : : BTPageOpaque rpageop;
2248 : : bool wasroot;
2249 : : bool wasonly;
2250 : :
3680 heikki.linnakangas@i 2251 [ # # ]: 0 : Assert(P_INCOMPLETE_SPLIT(lpageop));
309 pg@bowt.ie 2252 [ # # ]: 0 : Assert(heaprel != NULL);
2253 : :
2254 : : /* Lock right sibling, the one missing the downlink */
2255 : 0 : rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2916 kgrittn@postgresql.o 2256 : 0 : rpage = BufferGetPage(rbuf);
744 michael@paquier.xyz 2257 : 0 : rpageop = BTPageGetOpaque(rpage);
2258 : :
2259 : : /* Could this be a root split? */
3680 heikki.linnakangas@i 2260 [ # # ]: 0 : if (!stack)
2261 : : {
2262 : : Buffer metabuf;
2263 : : Page metapg;
2264 : : BTMetaPageData *metad;
2265 : :
2266 : : /* acquire lock on the metapage */
309 pg@bowt.ie 2267 : 0 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2916 kgrittn@postgresql.o 2268 : 0 : metapg = BufferGetPage(metabuf);
3680 heikki.linnakangas@i 2269 : 0 : metad = BTPageGetMeta(metapg);
2270 : :
1244 pg@bowt.ie 2271 : 0 : wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2272 : :
3680 heikki.linnakangas@i 2273 : 0 : _bt_relbuf(rel, metabuf);
2274 : : }
2275 : : else
1244 pg@bowt.ie 2276 : 0 : wasroot = false;
2277 : :
2278 : : /* Was this the only page on the level before split? */
2279 [ # # # # ]: 0 : wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2280 : :
3680 heikki.linnakangas@i 2281 [ # # ]: 0 : elog(DEBUG1, "finishing incomplete split of %u/%u",
2282 : : BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2283 : :
379 andres@anarazel.de 2284 : 0 : _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
3680 heikki.linnakangas@i 2285 : 0 : }
2286 : :
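In _bt_finish_split() above, a left page reached with no descent stack can only have been the true root if the metapage still points at it. A sketch of that check, pulled out as a hypothetical helper for clarity (not part of nbtinsert.c):

static bool
bt_left_page_was_root(Relation rel, Buffer lbuf)
{
	Buffer		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
	BTMetaPageData *metad = BTPageGetMeta(BufferGetPage(metabuf));
	bool		wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));

	_bt_relbuf(rel, metabuf);
	return wasroot;
}
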
2287 : : /*
2288 : : * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2289 : : * tuple whose downlink points to child page.
2290 : : *
2291 : : * Caller passes child's block number, which is used to identify
2292 : : * associated pivot tuple in parent page using a linear search that
2293 : : * matches on pivot's downlink/block number. The expected location of
2294 : : * the pivot tuple is taken from the stack one level above the child
2295 : : * page. This is used as a starting point. Insertions into the
2296 : : * parent level could cause the pivot tuple to move right; deletions
2297 : : * could cause it to move left, but not left of the page we previously
2298 : : * found it on.
2299 : : *
2300 : : * Caller can use its stack to relocate the pivot tuple/downlink for
2301 : : * any same-level page to the right of the page found by its initial
2302 : : * descent. This is necessary because of the possibility that caller
2303 : : * moved right to recover from a concurrent page split. It's also
2304 : : * convenient for certain callers to be able to step right when there
2305 : : * wasn't a concurrent page split, while still using their original
2306 : : * stack. For example, the checkingunique _bt_doinsert() case may
2307 : : * have to step right when there are many physical duplicates, and its
2308 : : * scantid forces an insertion to the right of the "first page the
2309 : : * value could be on". (This is also relied on by all of our callers
2310 : : * when dealing with !heapkeyspace indexes.)
2311 : : *
2312 : : * Returns write-locked parent page buffer, or InvalidBuffer if pivot
2313 : : * tuple not found (should not happen). Adjusts bts_blkno &
2314 : : * bts_offset if changed. Page split caller should insert its new
2315 : : * pivot tuple for its new right sibling page on parent page, at the
2316 : : * offset number bts_offset + 1.
2317 : : */
2318 : : Buffer
379 andres@anarazel.de 2319 :CBC 13101 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
2320 : : {
2321 : : BlockNumber blkno;
2322 : : OffsetNumber start;
2323 : :
8668 tgl@sss.pgh.pa.us 2324 : 13101 : blkno = stack->bts_blkno;
2325 : 13101 : start = stack->bts_offset;
2326 : :
2327 : : for (;;)
2328 : 11 : {
2329 : : Buffer buf;
2330 : : Page page;
2331 : : BTPageOpaque opaque;
2332 : :
309 pg@bowt.ie 2333 : 13112 : buf = _bt_getbuf(rel, blkno, BT_WRITE);
2916 kgrittn@postgresql.o 2334 : 13112 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2335 : 13112 : opaque = BTPageGetOpaque(page);
2336 : :
309 pg@bowt.ie 2337 [ - + ]: 13112 : Assert(heaprel != NULL);
1875 2338 [ - + ]: 13112 : if (P_INCOMPLETE_SPLIT(opaque))
2339 : : {
379 andres@anarazel.de 2340 :UBC 0 : _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
3680 heikki.linnakangas@i 2341 : 0 : continue;
2342 : : }
2343 : :
7722 tgl@sss.pgh.pa.us 2344 [ + + ]:CBC 13112 : if (!P_IGNORE(opaque))
2345 : : {
2346 : : OffsetNumber offnum,
2347 : : minoff,
2348 : : maxoff;
2349 : : ItemId itemid;
2350 : : IndexTuple item;
2351 : :
2352 [ + + ]: 13101 : minoff = P_FIRSTDATAKEY(opaque);
2353 : 13101 : maxoff = PageGetMaxOffsetNumber(page);
2354 : :
2355 : : /*
2356 : : * start = InvalidOffsetNumber means "search the whole page". We
2357 : : * need this test anyway due to the possibility that the page has a
2358 : : * high key now when it didn't before.
2359 : : */
2360 [ + + ]: 13101 : if (start < minoff)
2361 : 23 : start = minoff;
2362 : :
2363 : : /*
2364 : : * Need this check too, to guard against the possibility that the
2365 : : * page split since we visited it originally.
2366 : : */
7180 2367 [ - + ]: 13101 : if (start > maxoff)
7180 tgl@sss.pgh.pa.us 2368 :UBC 0 : start = OffsetNumberNext(maxoff);
2369 : :
2370 : : /*
2371 : : * These loops will check every item on the page --- but in an
2372 : : * order that's attuned to the probability of where it actually
2373 : : * is. Scan to the right first, then to the left.
2374 : : */
7722 tgl@sss.pgh.pa.us 2375 :CBC 13101 : for (offnum = start;
2376 [ + - ]: 13139 : offnum <= maxoff;
2377 : 38 : offnum = OffsetNumberNext(offnum))
2378 : : {
2379 : 13139 : itemid = PageGetItemId(page, offnum);
6654 2380 : 13139 : item = (IndexTuple) PageGetItem(page, itemid);
2381 : :
1581 pg@bowt.ie 2382 [ + + ]: 13139 : if (BTreeTupleGetDownLink(item) == child)
2383 : : {
2384 : : /* Return accurate pointer to where link is now */
7722 tgl@sss.pgh.pa.us 2385 : 13101 : stack->bts_blkno = blkno;
2386 : 13101 : stack->bts_offset = offnum;
2387 : 13101 : return buf;
2388 : : }
2389 : : }
2390 : :
7722 tgl@sss.pgh.pa.us 2391 :UBC 0 : for (offnum = OffsetNumberPrev(start);
2392 [ # # ]: 0 : offnum >= minoff;
2393 : 0 : offnum = OffsetNumberPrev(offnum))
2394 : : {
2395 : 0 : itemid = PageGetItemId(page, offnum);
6654 2396 : 0 : item = (IndexTuple) PageGetItem(page, itemid);
2397 : :
1581 pg@bowt.ie 2398 [ # # ]: 0 : if (BTreeTupleGetDownLink(item) == child)
2399 : : {
2400 : : /* Return accurate pointer to where link is now */
7722 tgl@sss.pgh.pa.us 2401 : 0 : stack->bts_blkno = blkno;
2402 : 0 : stack->bts_offset = offnum;
2403 : 0 : return buf;
2404 : : }
2405 : : }
2406 : : }
2407 : :
2408 : : /*
2409 : : * The item we're looking for moved right at least one page.
2410 : : *
2411 : : * Lehman and Yao couple/chain locks when moving right here, which we
2412 : : * can avoid. See nbtree/README.
2413 : : */
8668 tgl@sss.pgh.pa.us 2414 [ - + ]:CBC 11 : if (P_RIGHTMOST(opaque))
2415 : : {
8309 tgl@sss.pgh.pa.us 2416 :UBC 0 : _bt_relbuf(rel, buf);
7298 2417 : 0 : return InvalidBuffer;
2418 : : }
8668 tgl@sss.pgh.pa.us 2419 :CBC 11 : blkno = opaque->btpo_next;
7723 2420 : 11 : start = InvalidOffsetNumber;
8309 2421 : 11 : _bt_relbuf(rel, buf);
2422 : : }
2423 : : }
2424 : :
2425 : : /*
2426 : : * _bt_newlevel() -- Create a new level above root page.
2427 : : *
2428 : : * We've just split the old root page and need to create a new one.
2429 : : * In order to do this, we add a new root page to the file, then lock
2430 : : * the metadata page and update it. This is guaranteed to be deadlock-
2431 : : * free, because all readers release their locks on the metadata page
2432 : : * before trying to lock the root, and all writers lock the root before
2433 : : * trying to lock the metadata page. We have a write lock on the old
2434 : : * root page, so we have not introduced any cycles into the waits-for
2435 : : * graph.
2436 : : *
2437 : : * On entry, lbuf (the old root) and rbuf (its new peer) are write-
2438 : : * locked. On exit, a new root page exists with entries for the
2439 : : * two new children, metapage is updated and unlocked/unpinned.
2440 : : * The new root buffer is returned to the caller, which has to unlock/unpin
2441 : : * lbuf, rbuf & rootbuf.
2442 : : */
2443 : : static Buffer
309 pg@bowt.ie 2444 : 495 : _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
2445 : : {
2446 : : Buffer rootbuf;
2447 : : Page lpage,
2448 : : rootpage;
2449 : : BlockNumber lbkno,
2450 : : rbkno;
2451 : : BlockNumber rootblknum;
2452 : : BTPageOpaque rootopaque;
2453 : : BTPageOpaque lopaque;
2454 : : ItemId itemid;
2455 : : IndexTuple item;
2456 : : IndexTuple left_item;
2457 : : Size left_item_sz;
2458 : : IndexTuple right_item;
2459 : : Size right_item_sz;
2460 : : Buffer metabuf;
2461 : : Page metapg;
2462 : : BTMetaPageData *metad;
2463 : :
7723 tgl@sss.pgh.pa.us 2464 : 495 : lbkno = BufferGetBlockNumber(lbuf);
2465 : 495 : rbkno = BufferGetBlockNumber(rbuf);
2916 kgrittn@postgresql.o 2466 : 495 : lpage = BufferGetPage(lbuf);
744 michael@paquier.xyz 2467 : 495 : lopaque = BTPageGetOpaque(lpage);
2468 : :
2469 : : /* get a new root page */
309 pg@bowt.ie 2470 : 495 : rootbuf = _bt_allocbuf(rel, heaprel);
2916 kgrittn@postgresql.o 2471 : 495 : rootpage = BufferGetPage(rootbuf);
8593 vadim4o@yahoo.com 2472 : 495 : rootblknum = BufferGetBlockNumber(rootbuf);
2473 : :
2474 : : /* acquire lock on the metapage */
309 pg@bowt.ie 2475 : 495 : metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2916 kgrittn@postgresql.o 2476 : 495 : metapg = BufferGetPage(metabuf);
8508 vadim4o@yahoo.com 2477 : 495 : metad = BTPageGetMeta(metapg);
2478 : :
2479 : : /*
2480 : : * Create downlink item for left page (old root). The key value used is
2481 : : * "minus infinity", a sentinel value that's reliably less than any real
2482 : : * key value that could appear in the left page.
2483 : : */
3663 heikki.linnakangas@i 2484 : 495 : left_item_sz = sizeof(IndexTupleData);
2485 : 495 : left_item = (IndexTuple) palloc(left_item_sz);
2486 : 495 : left_item->t_info = left_item_sz;
1581 pg@bowt.ie 2487 : 495 : BTreeTupleSetDownLink(left_item, lbkno);
1468 2488 : 495 : BTreeTupleSetNAtts(left_item, 0, false);
2489 : :
2490 : : /*
2491 : : * Create downlink item for right page. The key for it is obtained from
2492 : : * the "high key" position in the left page.
2493 : : */
3663 heikki.linnakangas@i 2494 : 495 : itemid = PageGetItemId(lpage, P_HIKEY);
2495 : 495 : right_item_sz = ItemIdGetLength(itemid);
2496 : 495 : item = (IndexTuple) PageGetItem(lpage, itemid);
2497 : 495 : right_item = CopyIndexTuple(item);
1581 pg@bowt.ie 2498 : 495 : BTreeTupleSetDownLink(right_item, rbkno);
2499 : :
2500 : : /* NO EREPORT(ERROR) from here till newroot op is logged */
8493 tgl@sss.pgh.pa.us 2501 : 495 : START_CRIT_SECTION();
2502 : :
2503 : : /* upgrade metapage if needed */
1852 pg@bowt.ie 2504 [ - + ]: 495 : if (metad->btm_version < BTREE_NOVAC_VERSION)
2146 teodor@sigaev.ru 2505 :UBC 0 : _bt_upgrademetapage(metapg);
2506 : :
2507 : : /* set btree special data */
744 michael@paquier.xyz 2508 :CBC 495 : rootopaque = BTPageGetOpaque(rootpage);
9716 bruce@momjian.us 2509 : 495 : rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
7723 tgl@sss.pgh.pa.us 2510 : 495 : rootopaque->btpo_flags = BTP_ROOT;
1145 pg@bowt.ie 2511 : 495 : rootopaque->btpo_level =
744 michael@paquier.xyz 2512 : 495 : (BTPageGetOpaque(lpage))->btpo_level + 1;
6551 tgl@sss.pgh.pa.us 2513 : 495 : rootopaque->btpo_cycleid = 0;
2514 : :
2515 : : /* update metapage data */
7723 2516 : 495 : metad->btm_root = rootblknum;
1145 pg@bowt.ie 2517 : 495 : metad->btm_level = rootopaque->btpo_level;
7723 tgl@sss.pgh.pa.us 2518 : 495 : metad->btm_fastroot = rootblknum;
1145 pg@bowt.ie 2519 : 495 : metad->btm_fastlevel = rootopaque->btpo_level;
2520 : :
2521 : : /*
2522 : : * Insert the left page pointer into the new root page. The root page is
2523 : : * the rightmost page on its level so there is no "high key" in it; the
2524 : : * two items will go into positions P_HIKEY and P_FIRSTKEY.
2525 : : *
2526 : : * Note: we *must* insert the two items in item-number order, for the
2527 : : * benefit of _bt_restore_page().
2528 : : */
2187 teodor@sigaev.ru 2529 [ + - - + ]: 495 : Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
3663 heikki.linnakangas@i 2530 [ - + ]: 495 : if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2531 : : false, false) == InvalidOffsetNumber)
5949 tgl@sss.pgh.pa.us 2532 [ # # ]:UBC 0 : elog(PANIC, "failed to add leftkey to new root page"
2533 : : " while splitting block %u of index \"%s\"",
2534 : : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2535 : :
2536 : : /*
2537 : : * insert the right page pointer into the new root page.
2538 : : */
1852 pg@bowt.ie 2539 [ + - - + ]:CBC 495 : Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2540 [ + - - + ]: 495 : Assert(BTreeTupleGetNAtts(right_item, rel) <=
2541 : : IndexRelationGetNumberOfKeyAttributes(rel));
3663 heikki.linnakangas@i 2542 [ - + ]: 495 : if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2543 : : false, false) == InvalidOffsetNumber)
5949 tgl@sss.pgh.pa.us 2544 [ # # ]:UBC 0 : elog(PANIC, "failed to add rightkey to new root page"
2545 : : " while splitting block %u of index \"%s\"",
2546 : : BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2547 : :
2548 : : /* Clear the incomplete-split flag in the left child */
3680 heikki.linnakangas@i 2549 [ - + ]:CBC 495 : Assert(P_INCOMPLETE_SPLIT(lopaque));
2550 : 495 : lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2551 : 495 : MarkBufferDirty(lbuf);
2552 : :
6589 tgl@sss.pgh.pa.us 2553 : 495 : MarkBufferDirty(rootbuf);
2554 : 495 : MarkBufferDirty(metabuf);
2555 : :
2556 : : /* XLOG stuff */
4871 rhaas@postgresql.org 2557 [ + + + + : 495 : if (RelationNeedsWAL(rel))
+ + + - ]
2558 : : {
2559 : : xl_btree_newroot xlrec;
2560 : : XLogRecPtr recptr;
2561 : : xl_btree_metadata md;
2562 : :
7723 tgl@sss.pgh.pa.us 2563 : 482 : xlrec.rootblk = rootblknum;
8508 vadim4o@yahoo.com 2564 : 482 : xlrec.level = metad->btm_level;
2565 : :
3433 heikki.linnakangas@i 2566 : 482 : XLogBeginInsert();
2567 : 482 : XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
2568 : :
2569 : 482 : XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2570 : 482 : XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2354 tgl@sss.pgh.pa.us 2571 : 482 : XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2572 : :
1852 pg@bowt.ie 2573 [ - + ]: 482 : Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2574 : 482 : md.version = metad->btm_version;
3433 heikki.linnakangas@i 2575 : 482 : md.root = rootblknum;
2576 : 482 : md.level = metad->btm_level;
2577 : 482 : md.fastroot = rootblknum;
2578 : 482 : md.fastlevel = metad->btm_level;
1145 pg@bowt.ie 2579 : 482 : md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1509 2580 : 482 : md.allequalimage = metad->btm_allequalimage;
2581 : :
3433 heikki.linnakangas@i 2582 : 482 : XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
2583 : :
2584 : : /*
2585 : : * Direct access to the page is not ideal, but it is faster; a new
2586 : : * function in the page API should eventually handle this.
2587 : : */
2588 : 482 : XLogRegisterBufData(0,
2489 tgl@sss.pgh.pa.us 2589 : 482 : (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
3433 heikki.linnakangas@i 2590 : 482 : ((PageHeader) rootpage)->pd_special -
2591 : 482 : ((PageHeader) rootpage)->pd_upper);
2592 : :
2593 : 482 : recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2594 : :
3645 2595 : 482 : PageSetLSN(lpage, recptr);
8593 vadim4o@yahoo.com 2596 : 482 : PageSetLSN(rootpage, recptr);
8584 2597 : 482 : PageSetLSN(metapg, recptr);
2598 : : }
2599 : :
8493 tgl@sss.pgh.pa.us 2600 [ - + ]: 495 : END_CRIT_SECTION();
2601 : :
2602 : : /* done with metapage */
6589 2603 : 495 : _bt_relbuf(rel, metabuf);
2604 : :
3663 heikki.linnakangas@i 2605 : 495 : pfree(left_item);
2606 : 495 : pfree(right_item);
2607 : :
6668 neilc@samurai.com 2608 : 495 : return rootbuf;
2609 : : }
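
The lock-ordering discipline described in the header comment above (readers release the metapage lock before locking the root; writers only take the metapage lock while already holding the root lock) can be sketched with ordinary mutexes. This is an illustrative, self-contained sketch only, not PostgreSQL code: the names, the pthread locks, and the root-pointer variable are stand-ins invented for the example.

/*
 * Minimal sketch (not PostgreSQL code) of the deadlock-free lock ordering
 * used when installing a new root.  "meta_lock" and "root_lock" stand in
 * for the buffer content locks on the metapage and the root page.
 */
#include <pthread.h>

static pthread_mutex_t meta_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t root_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned root_blkno = 1;		/* stand-in for metad->btm_root */

static unsigned
reader_fetch_root(void)
{
	unsigned	blkno;

	pthread_mutex_lock(&meta_lock);		/* readers: read the root pointer ... */
	blkno = root_blkno;
	pthread_mutex_unlock(&meta_lock);	/* ... and drop it before locking root */
	pthread_mutex_lock(&root_lock);		/* caller ends up holding the root lock */
	return blkno;
}

static void
writer_install_new_root(unsigned newroot)
{
	pthread_mutex_lock(&root_lock);		/* writers: (old) root is locked first */
	pthread_mutex_lock(&meta_lock);		/* metapage is always locked last */
	root_blkno = newroot;
	pthread_mutex_unlock(&meta_lock);
	pthread_mutex_unlock(&root_lock);
}

int
main(void)
{
	writer_install_new_root(7);
	(void) reader_fetch_root();
	pthread_mutex_unlock(&root_lock);	/* reader releases the root when done */
	return 0;
}

Because no backend ever waits for the root while holding the metapage lock, the waits-for graph between these two locks stays acyclic, which is the point the header comment makes.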
2610 : :
2611 : : /*
2612 : : * _bt_pgaddtup() -- add a data item to a particular page during split.
2613 : : *
2614 : : * The difference between this routine and a bare PageAddItem call is
2615 : : * that this code can deal with the first data item on an internal btree
2616 : : * page in passing. This data item (which is called "firstright" within
2617 : : * _bt_split()) has a key that must be treated as minus infinity after
2618 : : * the split. Therefore, we truncate away all attributes when caller
2619 : : * specifies it's the first data item on page (downlink is not changed,
2620 : : * though). This extra step is only needed for the right page of an
2621 : : * internal page split. There is no need to do this for the first data
2622 : : * item on the existing/left page, since that will already have been
2623 : : * truncated during an earlier page split.
2624 : : *
2625 : : * See _bt_split() for a high level explanation of why we truncate here.
2626 : : * Note that this routine has nothing to do with suffix truncation,
2627 : : * despite using some of the same infrastructure.
2628 : : */
2629 : : static inline bool
4977 tgl@sss.pgh.pa.us 2630 : 3275633 : _bt_pgaddtup(Page page,
2631 : : Size itemsize,
2632 : : IndexTuple itup,
2633 : : OffsetNumber itup_off,
2634 : : bool newfirstdataitem)
2635 : : {
2636 : : IndexTupleData trunctuple;
2637 : :
1462 pg@bowt.ie 2638 [ + + ]: 3275633 : if (newfirstdataitem)
2639 : : {
6654 tgl@sss.pgh.pa.us 2640 : 129 : trunctuple = *itup;
2641 : 129 : trunctuple.t_info = sizeof(IndexTupleData);
1468 pg@bowt.ie 2642 : 129 : BTreeTupleSetNAtts(&trunctuple, 0, false);
6654 tgl@sss.pgh.pa.us 2643 : 129 : itup = &trunctuple;
2644 : 129 : itemsize = sizeof(IndexTupleData);
2645 : : }
2646 : :
1462 pg@bowt.ie 2647 [ - + ]: 3275633 : if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2648 : : false) == InvalidOffsetNumber))
4977 tgl@sss.pgh.pa.us 2649 :UBC 0 : return false;
2650 : :
4977 tgl@sss.pgh.pa.us 2651 :CBC 3275633 : return true;
2652 : : }
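
To make the "minus infinity" convention above concrete, here is a small self-contained sketch. It is not PostgreSQL code: DemoPivot and demo_compare are invented for the example. It shows why a truncated first data item (zero key attributes) always compares lower than any real search key, so descent into its child never needs to look at key bytes.

/*
 * Illustrative sketch only: a pivot with zero key attributes acts as
 * "minus infinity" and sorts before every possible search key.
 */
#include <stdio.h>

typedef struct
{
	int		natts;		/* 0 means "minus infinity" (truncated pivot) */
	int		key;		/* single integer key, for simplicity */
} DemoPivot;

static int
demo_compare(int searchkey, const DemoPivot *pivot)
{
	if (pivot->natts == 0)
		return 1;		/* every search key sorts after -inf */
	return (searchkey > pivot->key) - (searchkey < pivot->key);
}

int
main(void)
{
	DemoPivot	first = {0, 0};		/* first data item on an internal page */
	DemoPivot	second = {1, 42};

	/* prints "1 -1": even a very small key sorts after the -inf pivot */
	printf("%d %d\n", demo_compare(-1000, &first), demo_compare(-1000, &second));
	return 0;
}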
2653 : :
2654 : : /*
2655 : : * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2656 : : *
2657 : : * There are three operations performed here: simple index deletion, bottom-up
2658 : : * index deletion, and deduplication. If all three operations fail to free
2659 : : * enough space for the incoming item then caller will go on to split the
2660 : : * page. We always consider simple deletion first. If that doesn't work out
2661 : : * we consider alternatives. Callers that only want us to consider simple
2662 : : * deletion (without any fallback) ask for that using the 'simpleonly'
2663 : : * argument.
2664 : : *
2665 : : * We usually pick only one alternative "complex" operation when simple
2666 : : * deletion alone won't prevent a page split. The 'checkingunique',
2667 : : * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2668 : : *
2669 : : * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2670 : : * level flag was found set. The flag was useful back when there wasn't
2671 : : * necessarily one single page for a duplicate tuple to go on (before heap TID
2672 : : * became a part of the key space in version 4 indexes). But we don't
2673 : : * actually look at the flag anymore (it's not a gating condition for our
2674 : : * caller). That would cause us to miss tuples that are safe to delete,
2675 : : * without getting any benefit in return. We know that the alternative is to
2676 : : * split the page; scanning the line pointer array in passing won't have
2677 : : * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite
2678 : : * all this because !heapkeyspace indexes must still do a "getting tired"
2679 : : * linear search, and so are likely to get some benefit from using it as a
2680 : : * gating condition.)
2681 : : */
2682 : : static void
1244 pg@bowt.ie 2683 : 22756 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2684 : : BTInsertState insertstate,
2685 : : bool simpleonly, bool checkingunique,
2686 : : bool uniquedup, bool indexUnchanged)
2687 : : {
2688 : : OffsetNumber deletable[MaxIndexTuplesPerPage];
6402 bruce@momjian.us 2689 : 22756 : int ndeletable = 0;
2690 : : OffsetNumber offnum,
2691 : : minoff,
2692 : : maxoff;
1244 pg@bowt.ie 2693 : 22756 : Buffer buffer = insertstate->buf;
2694 : 22756 : BTScanInsert itup_key = insertstate->itup_key;
2916 kgrittn@postgresql.o 2695 : 22756 : Page page = BufferGetPage(buffer);
744 michael@paquier.xyz 2696 : 22756 : BTPageOpaque opaque = BTPageGetOpaque(page);
2697 : :
1852 pg@bowt.ie 2698 [ - + ]: 22756 : Assert(P_ISLEAF(opaque));
1187 2699 [ + - - + ]: 22756 : Assert(simpleonly || itup_key->heapkeyspace);
2700 [ - + - - : 22756 : Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
- - - - ]
2701 : :
2702 : : /*
2703 : : * Scan over all items to see which ones need to be deleted according to
2704 : : * LP_DEAD flags. We'll usually manage to delete a few extra items that
2705 : : * are not marked LP_DEAD in passing. Often the extra items that actually
2706 : : * end up getting deleted are items that would have had their LP_DEAD bit
2707 : : * set before long anyway (if we opted not to include them as extras).
2708 : : */
2709 [ + + ]: 22756 : minoff = P_FIRSTDATAKEY(opaque);
6473 tgl@sss.pgh.pa.us 2710 : 22756 : maxoff = PageGetMaxOffsetNumber(page);
1187 pg@bowt.ie 2711 : 22756 : for (offnum = minoff;
6473 tgl@sss.pgh.pa.us 2712 [ + + ]: 6184295 : offnum <= maxoff;
2713 : 6161539 : offnum = OffsetNumberNext(offnum))
2714 : : {
6402 bruce@momjian.us 2715 : 6161539 : ItemId itemId = PageGetItemId(page, offnum);
2716 : :
6059 tgl@sss.pgh.pa.us 2717 [ + + ]: 6161539 : if (ItemIdIsDead(itemId))
6473 2718 : 120976 : deletable[ndeletable++] = offnum;
2719 : : }
2720 : :
2721 [ + + ]: 22756 : if (ndeletable > 0)
2722 : : {
1187 pg@bowt.ie 2723 : 3256 : _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2724 : : insertstate->itup, minoff, maxoff);
1244 2725 : 3256 : insertstate->bounds_valid = false;
2726 : :
2727 : : /* Return when a page split has already been avoided */
2728 [ + + ]: 3256 : if (PageGetFreeSpace(page) >= insertstate->itemsz)
2729 : 10978 : return;
2730 : :
2731 : : /* Might as well assume duplicates (if checkingunique) */
2732 : 39 : uniquedup = true;
2733 : : }
2734 : :
2735 : : /*
2736 : : * We're done with simple deletion. Return early for callers that only
2737 : : * call here so that simple deletion can be considered. This includes
2738 : : * callers that explicitly ask for this and checkingunique callers that
2739 : : * probably don't have any version churn duplicates on the page.
2740 : : *
2741 : : * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2742 : : * return at this point (or when we go on to try either or both of our
2743 : : * other strategies and they also fail). We do not bother expending a
2744 : : * separate write to clear it, however. Caller will definitely clear it
2745 : : * when it goes on to split the page (note also that the deduplication
2746 : : * process will clear the flag in passing, just to keep things tidy).
2747 : : */
1187 2748 [ + - + + : 19539 : if (simpleonly || (checkingunique && !uniquedup))
+ + ]
2749 : : {
2750 [ - + ]: 7602 : Assert(!indexUnchanged);
1244 2751 : 7602 : return;
2752 : : }
2753 : :
2754 : : /* Assume bounds about to be invalidated (this is almost certain now) */
2755 : 11937 : insertstate->bounds_valid = false;
2756 : :
2757 : : /*
2758 : : * Perform bottom-up index deletion pass when executor hint indicated that
2759 : : * incoming item is logically unchanged, or for a unique index that is
2760 : : * known to have physical duplicates for some other reason. (There is a
2761 : : * large overlap between these two cases for a unique index. It's worth
2762 : : * having both triggering conditions in order to apply the optimization in
2763 : : * the event of successive related INSERT and DELETE statements.)
2764 : : *
2765 : : * We'll go on to do a deduplication pass when a bottom-up pass fails to
2766 : : * delete an acceptable amount of free space (a significant fraction of
2767 : : * the page, or space for the new item, whichever is greater).
2768 : : *
2769 : : * Note: Bottom-up index deletion uses the same equality/equivalence
2770 : : * routines as deduplication internally. However, it does not merge
2771 : : * together index tuples, so the same correctness considerations do not
2772 : : * apply. We deliberately omit an index-is-allequalimage test here.
2773 : : */
1187 2774 [ + + + + : 13666 : if ((indexUnchanged || uniquedup) &&
+ + ]
2775 : 1729 : _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2776 : 159 : return;
2777 : :
2778 : : /* Perform deduplication pass (when enabled and index-is-allequalimage) */
1244 2779 [ + - - + : 11778 : if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
+ + + + +
+ + - ]
362 2780 : 11769 : _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2781 [ + + + + ]: 11769 : (indexUnchanged || uniquedup));
2782 : : }
2783 : :
2784 : : /*
2785 : : * _bt_simpledel_pass - Simple index tuple deletion pass.
2786 : : *
2787 : : * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers
2788 : : * of all such tuples are determined by caller (caller passes these to us as
2789 : : * its 'deletable' argument).
2790 : : *
2791 : : * We might also delete extra index tuples that turn out to be safe to delete
2792 : : * in passing (though they must be cheap to check in passing to begin with).
2793 : : * There is no certainty that any extra tuples will be deleted, though. The
2794 : : * high level goal of the approach we take is to get the most out of each call
2795 : : * here (without noticeably increasing the per-call overhead compared to what
2796 : : * we need to do just to be able to delete the page's LP_DEAD-marked index
2797 : : * tuples).
2798 : : *
2799 : : * The number of extra index tuples that turn out to be deletable might
2800 : : * greatly exceed the number of LP_DEAD-marked index tuples due to various
2801 : : * locality related effects. For example, it's possible that the total number
2802 : : * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2803 : : * quite low, in which case we might end up checking if it's possible to
2804 : : * delete _most_ index tuples on the page (without the tableam needing to
2805 : : * access additional table blocks). The tableam will sometimes stumble upon
2806 : : * _many_ extra deletable index tuples in indexes where this pattern is
2807 : : * common.
2808 : : *
2809 : : * See nbtree/README for further details on simple index tuple deletion.
2810 : : */
2811 : : static void
1187 2812 : 3256 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2813 : : OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2814 : : OffsetNumber minoff, OffsetNumber maxoff)
2815 : : {
2816 : 3256 : Page page = BufferGetPage(buffer);
2817 : : BlockNumber *deadblocks;
2818 : : int ndeadblocks;
2819 : : TM_IndexDeleteOp delstate;
2820 : : OffsetNumber offnum;
2821 : :
2822 : : /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2823 : 3256 : deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2824 : : &ndeadblocks);
2825 : :
2826 : : /* Initialize tableam state that describes index deletion operation */
892 2827 : 3256 : delstate.irel = rel;
2828 : 3256 : delstate.iblknum = BufferGetBlockNumber(buffer);
1187 2829 : 3256 : delstate.bottomup = false;
2830 : 3256 : delstate.bottomupfreespace = 0;
2831 : 3256 : delstate.ndeltids = 0;
2832 : 3256 : delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2833 : 3256 : delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2834 : :
2835 : 3256 : for (offnum = minoff;
2836 [ + + ]: 924253 : offnum <= maxoff;
2837 : 920997 : offnum = OffsetNumberNext(offnum))
2838 : : {
2839 : 920997 : ItemId itemid = PageGetItemId(page, offnum);
2840 : 920997 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2841 : 920997 : TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2842 : 920997 : TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2843 : : BlockNumber tidblock;
2844 : : void *match;
2845 : :
2846 [ + + ]: 920997 : if (!BTreeTupleIsPosting(itup))
2847 : : {
2848 : 882717 : tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2849 : 882717 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2850 : : sizeof(BlockNumber), _bt_blk_cmp);
2851 : :
2852 [ + + ]: 882717 : if (!match)
2853 : : {
2854 [ - + ]: 546464 : Assert(!ItemIdIsDead(itemid));
2855 : 546464 : continue;
2856 : : }
2857 : :
2858 : : /*
2859 : : * TID's table block is among those pointed to by the TIDs from
2860 : : * LP_DEAD-bit set tuples on page -- add TID to deltids
2861 : : */
2862 : 336253 : odeltid->tid = itup->t_tid;
2863 : 336253 : odeltid->id = delstate.ndeltids;
2864 : 336253 : ostatus->idxoffnum = offnum;
2865 : 336253 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2866 : 336253 : ostatus->promising = false; /* unused */
2867 : 336253 : ostatus->freespace = 0; /* unused */
2868 : :
2869 : 336253 : delstate.ndeltids++;
2870 : : }
2871 : : else
2872 : : {
2873 : 38280 : int nitem = BTreeTupleGetNPosting(itup);
2874 : :
2875 [ + + ]: 183175 : for (int p = 0; p < nitem; p++)
2876 : : {
2877 : 144895 : ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2878 : :
2879 : 144895 : tidblock = ItemPointerGetBlockNumber(tid);
2880 : 144895 : match = bsearch(&tidblock, deadblocks, ndeadblocks,
2881 : : sizeof(BlockNumber), _bt_blk_cmp);
2882 : :
2883 [ + + ]: 144895 : if (!match)
2884 : : {
2885 [ - + ]: 124124 : Assert(!ItemIdIsDead(itemid));
2886 : 124124 : continue;
2887 : : }
2888 : :
2889 : : /*
2890 : : * TID's table block is among those pointed to by the TIDs
2891 : : * from LP_DEAD-bit set tuples on page -- add TID to deltids
2892 : : */
2893 : 20771 : odeltid->tid = *tid;
2894 : 20771 : odeltid->id = delstate.ndeltids;
2895 : 20771 : ostatus->idxoffnum = offnum;
2896 : 20771 : ostatus->knowndeletable = ItemIdIsDead(itemid);
2897 : 20771 : ostatus->promising = false; /* unused */
2898 : 20771 : ostatus->freespace = 0; /* unused */
2899 : :
2900 : 20771 : odeltid++;
2901 : 20771 : ostatus++;
2902 : 20771 : delstate.ndeltids++;
2903 : : }
2904 : : }
2905 : : }
2906 : :
2907 : 3256 : pfree(deadblocks);
2908 : :
2909 [ - + ]: 3256 : Assert(delstate.ndeltids >= ndeletable);
2910 : :
2911 : : /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2912 : 3256 : _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2913 : :
2914 : 3256 : pfree(delstate.deltids);
2915 : 3256 : pfree(delstate.status);
2916 : 3256 : }
2917 : :
2918 : : /*
2919 : : * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2920 : : *
2921 : : * Builds sorted and unique-ified array of table block numbers from index
2922 : : * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table
2923 : : * block from incoming newitem just in case it isn't among the LP_DEAD-related
2924 : : * table blocks.
2925 : : *
2926 : : * Always counting the newitem's table block as an LP_DEAD-related block makes
2927 : : * sense because the cost is consistently low; it is practically certain that
2928 : : * the table block will not incur a buffer miss in tableam. On the other hand
2929 : : * the benefit is often quite high. There is a decent chance that there will
2930 : : * be some deletable items from this block, since in general most garbage
2931 : : * tuples became garbage in the recent past (in many cases this won't be the
2932 : : * first logical row that core code added to/modified in table block
2933 : : * recently).
2934 : : *
2935 : : * Returns final array, and sets *nblocks to its final size for caller.
2936 : : */
2937 : : static BlockNumber *
2938 : 3256 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2939 : : IndexTuple newitem, int *nblocks)
2940 : : {
2941 : : int spacentids,
2942 : : ntids;
2943 : : BlockNumber *tidblocks;
2944 : :
2945 : : /*
2946 : : * Accumulate each TID's block in array whose initial size has space for
2947 : : * one table block per LP_DEAD-set tuple (plus space for the newitem table
2948 : : * block). Array will only need to grow when there are LP_DEAD-marked
2949 : : * posting list tuples (which is not that common).
2950 : : */
2951 : 3256 : spacentids = ndeletable + 1;
2952 : 3256 : ntids = 0;
2953 : 3256 : tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2954 : :
2955 : : /*
2956 : : * First add the table block for the incoming newitem. This is the one
2957 : : * case where simple deletion can visit a table block that doesn't have
2958 : : * any known deletable items.
2959 : : */
2960 [ + - - + ]: 3256 : Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2961 : 3256 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2962 : :
2963 [ + + ]: 124232 : for (int i = 0; i < ndeletable; i++)
2964 : : {
2965 : 120976 : ItemId itemid = PageGetItemId(page, deletable[i]);
2966 : 120976 : IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);
2967 : :
2968 [ - + ]: 120976 : Assert(ItemIdIsDead(itemid));
2969 : :
2970 [ + + ]: 120976 : if (!BTreeTupleIsPosting(itup))
2971 : : {
2972 [ + + ]: 116836 : if (ntids + 1 > spacentids)
2973 : : {
2974 : 95 : spacentids *= 2;
2975 : : tidblocks = (BlockNumber *)
2976 : 95 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2977 : : }
2978 : :
2979 : 116836 : tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2980 : : }
2981 : : else
2982 : : {
2983 : 4140 : int nposting = BTreeTupleGetNPosting(itup);
2984 : :
2985 [ + + ]: 4140 : if (ntids + nposting > spacentids)
2986 : : {
2987 : 89 : spacentids = Max(spacentids * 2, ntids + nposting);
2988 : : tidblocks = (BlockNumber *)
2989 : 89 : repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2990 : : }
2991 : :
2992 [ + + ]: 13914 : for (int j = 0; j < nposting; j++)
2993 : : {
2994 : 9774 : ItemPointer tid = BTreeTupleGetPostingN(itup, j);
2995 : :
2996 : 9774 : tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
2997 : : }
2998 : : }
2999 : : }
3000 : :
3001 : 3256 : qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3002 : 3256 : *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3003 : :
3004 : 3256 : return tidblocks;
3005 : : }
3006 : :
3007 : : /*
3008 : : * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3009 : : */
3010 : : static inline int
3011 : 2332463 : _bt_blk_cmp(const void *arg1, const void *arg2)
3012 : : {
3013 : 2332463 : BlockNumber b1 = *((BlockNumber *) arg1);
3014 : 2332463 : BlockNumber b2 = *((BlockNumber *) arg2);
3015 : :
58 nathan@postgresql.or 3016 :GNC 2332463 : return pg_cmp_u32(b1, b2);
3017 : : }