|            TLA  Line data    Source code 
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * nbtinsert.c
       4                 :  *    Item insertion in Lehman and Yao btrees for Postgres.
       5                 :  *
       6                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       7                 :  * Portions Copyright (c) 1994, Regents of the University of California
       8                 :  *
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/access/nbtree/nbtinsert.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : 
      16                 : #include "postgres.h"
      17                 : 
      18                 : #include "access/nbtree.h"
      19                 : #include "access/nbtxlog.h"
      20                 : #include "access/transam.h"
      21                 : #include "access/xloginsert.h"
      22                 : #include "common/pg_prng.h"
      23                 : #include "lib/qunique.h"
      24                 : #include "miscadmin.h"
      25                 : #include "storage/lmgr.h"
      26                 : #include "storage/predicate.h"
      27                 : #include "storage/smgr.h"
      28                 : 
      29                 : /* Minimum tree height for application of fastpath optimization */
      30                 : #define BTREE_FASTPATH_MIN_LEVEL    2
      31                 : 
      32                 : 
      33                 : static BTStack _bt_search_insert(Relation rel, Relation heaprel,
      34                 :                                  BTInsertState insertstate);
      35                 : static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
      36                 :                                       Relation heapRel,
      37                 :                                       IndexUniqueCheck checkUnique, bool *is_unique,
      38                 :                                       uint32 *speculativeToken);
      39                 : static OffsetNumber _bt_findinsertloc(Relation rel,
      40                 :                                       BTInsertState insertstate,
      41                 :                                       bool checkingunique,
      42                 :                                       bool indexUnchanged,
      43                 :                                       BTStack stack,
      44                 :                                       Relation heapRel);
      45                 : static void _bt_stepright(Relation rel, Relation heaprel,
      46                 :                           BTInsertState insertstate, BTStack stack);
      47                 : static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
      48                 :                            Buffer buf,
      49                 :                            Buffer cbuf,
      50                 :                            BTStack stack,
      51                 :                            IndexTuple itup,
      52                 :                            Size itemsz,
      53                 :                            OffsetNumber newitemoff,
      54                 :                            int postingoff,
      55                 :                            bool split_only_page);
      56                 : static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
      57                 :                         Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
      58                 :                         Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
      59                 :                         IndexTuple nposting, uint16 postingoff);
      60                 : static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
      61                 :                               Buffer rbuf, BTStack stack, bool isroot, bool isonly);
      62                 : static Buffer _bt_newroot(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
      63                 : static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
      64                 :                                 OffsetNumber itup_off, bool newfirstdataitem);
      65                 : static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
      66                 :                                          BTInsertState insertstate,
      67                 :                                          bool simpleonly, bool checkingunique,
      68                 :                                          bool uniquedup, bool indexUnchanged);
      69                 : static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
      70                 :                                OffsetNumber *deletable, int ndeletable,
      71                 :                                IndexTuple newitem, OffsetNumber minoff,
      72                 :                                OffsetNumber maxoff);
      73                 : static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
      74                 :                                    int ndeletable, IndexTuple newitem,
      75                 :                                    int *nblocks);
      76                 : static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
      77                 : 
      78                 : /*
      79                 :  *  _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
      80                 :  *
      81                 :  *      This routine is called by the public interface routine, btinsert.
      82                 :  *      By here, itup is filled in, including the TID.
      83                 :  *
      84                 :  *      If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
      85                 :  *      will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
      86                 :  *      UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
      87                 :  *      For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
      88                 :  *      don't actually insert.
      89                 :  *
      90                 :  *      indexUnchanged executor hint indicates if itup is from an
      91                 :  *      UPDATE that didn't logically change the indexed value, but
      92                 :  *      must nevertheless have a new entry to point to a successor
      93                 :  *      version.
      94                 :  *
      95                 :  *      The result value is only significant for UNIQUE_CHECK_PARTIAL:
      96                 :  *      it must be true if the entry is known unique, else false.
      97                 :  *      (In the current implementation we'll also return true after a
      98                 :  *      successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
      99                 :  *      that's just a coding artifact.)
     100                 :  */
     101                 : bool
     102 GIC     7167159 : _bt_doinsert(Relation rel, IndexTuple itup,
     103                 :              IndexUniqueCheck checkUnique, bool indexUnchanged,
     104 ECB             :              Relation heapRel)
     105                 : {
     106 GIC     7167159 :     bool        is_unique = false;
     107                 :     BTInsertStateData insertstate;
     108 ECB             :     BTScanInsert itup_key;
     109                 :     BTStack     stack;
     110 GIC     7167159 :     bool        checkingunique = (checkUnique != UNIQUE_CHECK_NO);
     111                 : 
     112 ECB             :     /* we need an insertion scan key to do our search, so build one */
     113 GNC     7167159 :     itup_key = _bt_mkscankey(rel, heapRel, itup);
     114                 : 
     115 CBC     7167159 :     if (checkingunique)
     116                 :     {
     117         5249429 :         if (!itup_key->anynullkeys)
     118                 :         {
     119 ECB             :             /* No (heapkeyspace) scantid until uniqueness established */
     120 GIC     5249351 :             itup_key->scantid = NULL;
     121                 :         }
     122 ECB             :         else
     123                 :         {
     124                 :             /*
     125                 :              * Scan key for new tuple contains NULL key values.  Bypass
     126                 :              * checkingunique steps.  They are unnecessary because core code
     127                 :              * considers NULL unequal to every value, including NULL.
     128                 :              *
     129                 :              * This optimization avoids O(N^2) behavior within the
     130                 :              * _bt_findinsertloc() heapkeyspace path when a unique index has a
     131                 :              * large number of "duplicates" with NULL key values.
     132                 :              */
     133 GIC          78 :             checkingunique = false;
     134                 :             /* Tuple is unique in the sense that core code cares about */
     135 CBC          78 :             Assert(checkUnique != UNIQUE_CHECK_EXISTING);
     136 GIC          78 :             is_unique = true;
     137 ECB             :         }
     138                 :     }
     139                 : 
     140                 :     /*
     141                 :      * Fill in the BTInsertState working area, to track the current page and
     142                 :      * position within the page to insert on.
     143                 :      *
     144                 :      * Note that itemsz is passed down to lower level code that deals with
     145                 :      * inserting the item.  It must be MAXALIGN()'d.  This ensures that space
     146                 :      * accounting code consistently considers the alignment overhead that we
     147                 :      * expect PageAddItem() will add later.  (Actually, index_form_tuple() is
     148                 :      * already conservative about alignment, but we don't rely on that from
     149                 :      * this distance.  Besides, preserving the "true" tuple size in index
     150                 :      * tuple headers for the benefit of nbtsplitloc.c might happen someday.
     151                 :      * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
     152                 :      */
     153 GIC     7167159 :     insertstate.itup = itup;
     154         7167159 :     insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
     155 CBC     7167159 :     insertstate.itup_key = itup_key;
     156         7167159 :     insertstate.bounds_valid = false;
     157         7167159 :     insertstate.buf = InvalidBuffer;
     158         7167159 :     insertstate.postingoff = 0;
     159 ECB             : 
     160 CBC     7167171 : search:
     161                 : 
     162 ECB             :     /*
     163                 :      * Find and lock the leaf page that the tuple should be added to by
     164                 :      * searching from the root page.  insertstate.buf will hold a buffer that
     165                 :      * is locked in exclusive mode afterwards.
     166                 :      */
     167 GNC     7167171 :     stack = _bt_search_insert(rel, heapRel, &insertstate);
     168                 : 
     169 ECB             :     /*
     170                 :      * checkingunique inserts are not allowed to go ahead when two tuples with
     171                 :      * equal key attribute values would be visible to new MVCC snapshots once
     172                 :      * the xact commits.  Check for conflicts in the locked page/buffer (if
     173                 :      * needed) here.
     174                 :      *
     175                 :      * It might be necessary to check a page to the right in _bt_check_unique,
     176                 :      * though that should be very rare.  In practice the first page the value
     177                 :      * could be on (with scantid omitted) is almost always also the only page
     178                 :      * that a matching tuple might be found on.  This is due to the behavior
     179                 :      * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
     180                 :      * only be allowed to cross a page boundary when there is no candidate
     181                 :      * leaf page split point that avoids it.  Also, _bt_check_unique can use
     182                 :      * the leaf page high key to determine that there will be no duplicates on
     183                 :      * the right sibling without actually visiting it (it uses the high key in
     184                 :      * cases where the new item happens to belong at the far right of the leaf
     185                 :      * page).
     186                 :      *
     187                 :      * NOTE: obviously, _bt_check_unique can only detect keys that are already
     188                 :      * in the index; so it cannot defend against concurrent insertions of the
     189                 :      * same key.  We protect against that by means of holding a write lock on
     190                 :      * the first page the value could be on, with omitted/-inf value for the
     191                 :      * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
     192                 :      * the same key must acquire a write lock on the same page, so only one
     193                 :      * would-be inserter can be making the check at one time.  Furthermore,
     194                 :      * once we are past the check we hold write locks continuously until we
     195                 :      * have performed our insertion, so no later inserter can fail to see our
     196                 :      * insertion.  (This requires some care in _bt_findinsertloc.)
     197                 :      *
     198                 :      * If we must wait for another xact, we release the lock while waiting,
     199                 :      * and then must perform a new search.
     200                 :      *
     201                 :      * For a partial uniqueness check, we don't wait for the other xact. Just
     202                 :      * let the tuple in and return false for possibly non-unique, or true for
     203                 :      * definitely unique.
     204                 :      */
     205 GIC     7167171 :     if (checkingunique)
     206                 :     {
     207 ECB             :         TransactionId xwait;
     208                 :         uint32      speculativeToken;
     209                 : 
     210 GIC     5249363 :         xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
     211                 :                                  &is_unique, &speculativeToken);
     212 ECB             : 
     213 GIC     5249110 :         if (unlikely(TransactionIdIsValid(xwait)))
     214                 :         {
     215 ECB             :             /* Have to wait for the other guy ... */
     216 GIC          14 :             _bt_relbuf(rel, insertstate.buf);
     217              14 :             insertstate.buf = InvalidBuffer;
     218 ECB             : 
     219                 :             /*
     220                 :              * If it's a speculative insertion, wait for it to finish (ie. to
     221                 :              * go ahead with the insertion, or kill the tuple).  Otherwise
     222                 :              * wait for the transaction to finish as usual.
     223                 :              */
     224 GIC          14 :             if (speculativeToken)
     225 UIC           0 :                 SpeculativeInsertionWait(xwait, speculativeToken);
     226 ECB             :             else
     227 GBC          14 :                 XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
     228                 : 
     229 ECB             :             /* start over... */
     230 GIC          12 :             if (stack)
     231 UIC           0 :                 _bt_freestack(stack);
     232 CBC          12 :             goto search;
     233 EUB             :         }
     234 ECB             : 
     235                 :         /* Uniqueness is established -- restore heap tid as scantid */
     236 GIC     5249096 :         if (itup_key->heapkeyspace)
     237         5249096 :             itup_key->scantid = &itup->t_tid;
     238 ECB             :     }
     239                 : 
     240 GIC     7166904 :     if (checkUnique != UNIQUE_CHECK_EXISTING)
     241                 :     {
     242 ECB             :         OffsetNumber newitemoff;
     243                 : 
     244                 :         /*
     245                 :          * The only conflict predicate locking cares about for indexes is when
     246                 :          * an index tuple insert conflicts with an existing lock.  We don't
     247                 :          * know the actual page we're going to insert on for sure just yet in
     248                 :          * checkingunique and !heapkeyspace cases, but it's okay to use the
     249                 :          * first page the value could be on (with scantid omitted) instead.
     250                 :          */
     251 GIC     7166877 :         CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
     252                 : 
     253 ECB             :         /*
     254                 :          * Do the insertion.  Note that insertstate contains cached binary
     255                 :          * search bounds established within _bt_check_unique when insertion is
     256                 :          * checkingunique.
     257                 :          */
     258 GIC     7166874 :         newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
     259                 :                                        indexUnchanged, stack, heapRel);
     260 GNC     7166874 :         _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
     261                 :                        stack, itup, insertstate.itemsz, newitemoff,
     262 ECB             :                        insertstate.postingoff, false);
     263                 :     }
     264                 :     else
     265                 :     {
     266                 :         /* just release the buffer */
     267 GIC          27 :         _bt_relbuf(rel, insertstate.buf);
     268                 :     }
     269 ECB             : 
     270                 :     /* be tidy */
     271 GIC     7166901 :     if (stack)
     272         5665174 :         _bt_freestack(stack);
     273 CBC     7166901 :     pfree(itup_key);
     274 ECB             : 
     275 CBC     7166901 :     return is_unique;
     276                 : }
     277 ECB             : 
     278                 : /*
     279                 :  *  _bt_search_insert() -- _bt_search() wrapper for inserts
     280                 :  *
     281                 :  * Search the tree for a particular scankey, or more precisely for the first
     282                 :  * leaf page it could be on.  Try to make use of the fastpath optimization's
     283                 :  * rightmost leaf page cache before actually searching the tree from the root
     284                 :  * page, though.
     285                 :  *
     286                 :  * Return value is a stack of parent-page pointers (though see notes about
     287                 :  * fastpath optimization and page splits below).  insertstate->buf is set to
     288                 :  * the address of the leaf-page buffer, which is write-locked and pinned in
     289                 :  * all cases (if necessary by creating a new empty root page for caller).
     290                 :  *
     291                 :  * The fastpath optimization avoids most of the work of searching the tree
     292                 :  * repeatedly when a single backend inserts successive new tuples on the
     293                 :  * rightmost leaf page of an index.  A backend cache of the rightmost leaf
     294                 :  * page is maintained within _bt_insertonpg(), and used here.  The cache is
     295                 :  * invalidated here when an insert of a non-pivot tuple must take place on a
     296                 :  * non-rightmost leaf page.
     297                 :  *
     298                 :  * The optimization helps with indexes on an auto-incremented field.  It also
     299                 :  * helps with indexes on datetime columns, as well as indexes with lots of
     300                 :  * NULL values.  (NULLs usually get inserted in the rightmost page for single
     301                 :  * column indexes, since they usually get treated as coming after everything
     302                 :  * else in the key space.  Individual NULL tuples will generally be placed on
     303                 :  * the rightmost leaf page due to the influence of the heap TID column.)
     304                 :  *
     305                 :  * Note that we avoid applying the optimization when there is insufficient
     306                 :  * space on the rightmost page to fit caller's new item.  This is necessary
     307                 :  * because we'll need to return a real descent stack when a page split is
     308                 :  * expected (actually, caller can cope with a leaf page split that uses a NULL
     309                 :  * stack, but that's very slow and so must be avoided).  Note also that the
     310                 :  * fastpath optimization acquires the lock on the page conditionally as a way
     311                 :  * of reducing extra contention when there are concurrent insertions into the
     312                 :  * rightmost page (we give up if we'd have to wait for the lock).  We assume
     313                 :  * that it isn't useful to apply the optimization when there is contention,
     314                 :  * since each per-backend cache won't stay valid for long.
     315                 :  */
     316                 : static BTStack
     317 GNC     7167171 : _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
     318                 : {
     319 CBC     7167171 :     Assert(insertstate->buf == InvalidBuffer);
     320 GIC     7167171 :     Assert(!insertstate->bounds_valid);
     321 CBC     7167171 :     Assert(insertstate->postingoff == 0);
     322 ECB             : 
     323 CBC     7167171 :     if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
     324                 :     {
     325 ECB             :         /* Simulate a _bt_getbuf() call with conditional locking */
     326 GIC       34265 :         insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
     327           34265 :         if (_bt_conditionallockbuf(rel, insertstate->buf))
     328 ECB             :         {
     329                 :             Page        page;
     330                 :             BTPageOpaque opaque;
     331                 : 
     332 GIC       34019 :             _bt_checkpage(rel, insertstate->buf);
     333           34019 :             page = BufferGetPage(insertstate->buf);
     334 CBC       34019 :             opaque = BTPageGetOpaque(page);
     335 ECB             : 
     336                 :             /*
     337                 :              * Check if the page is still the rightmost leaf page and has
     338                 :              * enough free space to accommodate the new tuple.  Also check
     339                 :              * that the insertion scan key is strictly greater than the first
     340                 :              * non-pivot tuple on the page.  (Note that we expect itup_key's
     341                 :              * scantid to be unset when our caller is a checkingunique
     342                 :              * inserter.)
     343                 :              */
     344 GIC       34019 :             if (P_RIGHTMOST(opaque) &&
     345           33973 :                 P_ISLEAF(opaque) &&
     346 CBC       33973 :                 !P_IGNORE(opaque) &&
     347           67742 :                 PageGetFreeSpace(page) > insertstate->itemsz &&
     348           67538 :                 PageGetMaxOffsetNumber(page) >= P_HIKEY &&
     349           33769 :                 _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
     350 ECB             :             {
     351                 :                 /*
     352                 :                  * Caller can use the fastpath optimization because cached
     353                 :                  * block is still rightmost leaf page, which can fit caller's
     354                 :                  * new tuple without splitting.  Keep block in local cache for
     355                 :                  * next insert, and have caller use NULL stack.
     356                 :                  *
     357                 :                  * Note that _bt_insert_parent() has an assertion that catches
     358                 :                  * leaf page splits that somehow follow from a fastpath insert
     359                 :                  * (it should only be passed a NULL stack when it must deal
     360                 :                  * with a concurrent root page split, and never because a NULL
     361                 :                  * stack was returned here).
     362                 :                  */
     363 GIC       33747 :                 return NULL;
     364                 :             }
     365 ECB             : 
     366                 :             /* Page unsuitable for caller, drop lock and pin */
     367 GIC         272 :             _bt_relbuf(rel, insertstate->buf);
     368                 :         }
     369 ECB             :         else
     370                 :         {
     371                 :             /* Lock unavailable, drop pin */
     372 GIC         246 :             ReleaseBuffer(insertstate->buf);
     373                 :         }
     374 ECB             : 
     375                 :         /* Forget block, since cache doesn't appear to be useful */
     376 GIC         518 :         RelationSetTargetBlock(rel, InvalidBlockNumber);
     377                 :     }
     378 ECB             : 
     379                 :     /* Cannot use optimization -- descend tree, return proper descent stack */
     380 GNC     7133424 :     return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
     381                 :                       BT_WRITE, NULL);
     382 ECB             : }
     383                 : 
     384                 : /*
     385                 :  *  _bt_check_unique() -- Check for violation of unique index constraint
     386                 :  *
     387                 :  * Returns InvalidTransactionId if there is no conflict, else an xact ID
     388                 :  * we must wait for to see if it commits a conflicting tuple.   If an actual
     389                 :  * conflict is detected, no return --- just ereport().  If an xact ID is
     390                 :  * returned, and the conflicting tuple still has a speculative insertion in
     391                 :  * progress, *speculativeToken is set to non-zero, and the caller can wait for
     392                 :  * the verdict on the insertion using SpeculativeInsertionWait().
     393                 :  *
     394                 :  * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
     395                 :  * InvalidTransactionId because we don't want to wait.  In this case we
     396                 :  * set *is_unique to false if there is a potential conflict, and the
     397                 :  * core code must redo the uniqueness check later.
     398                 :  *
     399                 :  * As a side-effect, sets state in insertstate that can later be used by
     400                 :  * _bt_findinsertloc() to reuse most of the binary search work we do
     401                 :  * here.
     402                 :  *
     403                 :  * This code treats NULLs as equal, unlike the default semantics for unique
     404                 :  * indexes.  So do not call here when there are NULL values in scan key and
     405                 :  * the index uses the default NULLS DISTINCT mode.
     406                 :  */
     407                 : static TransactionId
     408 GIC     5249363 : _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
     409                 :                  IndexUniqueCheck checkUnique, bool *is_unique,
     410 ECB             :                  uint32 *speculativeToken)
     411                 : {
     412 GIC     5249363 :     IndexTuple  itup = insertstate->itup;
     413         5249363 :     IndexTuple  curitup = NULL;
     414 CBC     5249363 :     ItemId      curitemid = NULL;
     415         5249363 :     BTScanInsert itup_key = insertstate->itup_key;
     416 ECB             :     SnapshotData SnapshotDirty;
     417                 :     OffsetNumber offset;
     418                 :     OffsetNumber maxoff;
     419                 :     Page        page;
     420                 :     BTPageOpaque opaque;
     421 GIC     5249363 :     Buffer      nbuf = InvalidBuffer;
     422         5249363 :     bool        found = false;
     423 CBC     5249363 :     bool        inposting = false;
     424         5249363 :     bool        prevalldead = true;
     425         5249363 :     int         curposti = 0;
     426 ECB             : 
     427                 :     /* Assume unique until we find a duplicate */
     428 GIC     5249363 :     *is_unique = true;
     429                 : 
     430 CBC     5249363 :     InitDirtySnapshot(SnapshotDirty);
     431                 : 
     432         5249363 :     page = BufferGetPage(insertstate->buf);
     433 GIC     5249363 :     opaque = BTPageGetOpaque(page);
     434 CBC     5249363 :     maxoff = PageGetMaxOffsetNumber(page);
     435 ECB             : 
     436                 :     /*
     437                 :      * Find the first tuple with the same key.
     438                 :      *
     439                 :      * This also saves the binary search bounds in insertstate.  We use them
     440                 :      * in the fastpath below, but also in the _bt_findinsertloc() call later.
     441                 :      */
     442 GIC     5249363 :     Assert(!insertstate->bounds_valid);
     443         5249363 :     offset = _bt_binsrch_insert(rel, insertstate);
     444 ECB             : 
     445                 :     /*
     446                 :      * Scan over all equal tuples, looking for live conflicts.
     447                 :      */
     448 GIC     5249363 :     Assert(!insertstate->bounds_valid || insertstate->low == offset);
     449         5249363 :     Assert(!itup_key->anynullkeys);
     450 CBC     5249363 :     Assert(itup_key->scantid == NULL);
     451 ECB             :     for (;;)
     452                 :     {
     453                 :         /*
     454                 :          * Each iteration of the loop processes one heap TID, not one index
     455                 :          * tuple.  Current offset number for page isn't usually advanced on
     456                 :          * iterations that process heap TIDs from posting list tuples.
     457                 :          *
     458                 :          * "inposting" state is set when _inside_ a posting list --- not when
     459                 :          * we're at the start (or end) of a posting list.  We advance curposti
     460                 :          * at the end of the iteration when inside a posting list tuple.  In
     461                 :          * general, every loop iteration either advances the page offset or
     462                 :          * advances curposti --- an iteration that handles the rightmost/max
     463                 :          * heap TID in a posting list finally advances the page offset (and
     464                 :          * unsets "inposting").
     465                 :          *
     466                 :          * Make sure the offset points to an actual index tuple before trying
     467                 :          * to examine it...
     468                 :          */
     469 GIC    11286194 :         if (offset <= maxoff)
     470                 :         {
     471 ECB             :             /*
     472                 :              * Fastpath: In most cases, we can use cached search bounds to
     473                 :              * limit our consideration to items that are definitely
     474                 :              * duplicates.  This fastpath doesn't apply when the original page
     475                 :              * is empty, or when initial offset is past the end of the
     476                 :              * original page, which may indicate that we need to examine a
     477                 :              * second or subsequent page.
     478                 :              *
     479                 :              * Note that this optimization allows us to avoid calling
     480                 :              * _bt_compare() directly when there are no duplicates, as long as
     481                 :              * the offset where the key will go is not at the end of the page.
     482                 :              */
     483 GIC     8446167 :             if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
     484                 :             {
     485 CBC     2256132 :                 Assert(insertstate->bounds_valid);
     486 GIC     2256132 :                 Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
     487 CBC     2256132 :                 Assert(insertstate->low <= insertstate->stricthigh);
     488         2256132 :                 Assert(_bt_compare(rel, itup_key, page, offset) < 0);
     489         2256132 :                 break;
     490 ECB             :             }
     491                 : 
     492                 :             /*
     493                 :              * We can skip items that are already marked killed.
     494                 :              *
     495                 :              * In the presence of heavy update activity an index may contain
     496                 :              * many killed items with the same key; running _bt_compare() on
     497                 :              * each killed item gets expensive.  Just advance over killed
     498                 :              * items as quickly as we can.  We only apply _bt_compare() when
     499                 :              * we get to a non-killed item.  We could reuse the bounds to
     500                 :              * avoid _bt_compare() calls for known equal tuples, but it
     501                 :              * doesn't seem worth it.
     502                 :              */
     503 GIC     6190035 :             if (!inposting)
     504         3933758 :                 curitemid = PageGetItemId(page, offset);
     505 CBC     6190035 :             if (inposting || !ItemIdIsDead(curitemid))
     506 ECB             :             {
     507                 :                 ItemPointerData htid;
     508 GIC     5955716 :                 bool        all_dead = false;
     509                 : 
     510 CBC     5955716 :                 if (!inposting)
     511                 :                 {
     512 ECB             :                     /* Plain tuple, or first TID in posting list tuple */
     513 GIC     3699439 :                     if (_bt_compare(rel, itup_key, page, offset) != 0)
     514          138699 :                         break;  /* we're past all the equal tuples */
     515 ECB             : 
     516                 :                     /* Advanced curitup */
     517 GIC     3560740 :                     curitup = (IndexTuple) PageGetItem(page, curitemid);
     518         3560740 :                     Assert(!BTreeTupleIsPivot(curitup));
     519 ECB             :                 }
     520                 : 
     521                 :                 /* okay, we gotta fetch the heap tuple using htid ... */
     522 GIC     5817017 :                 if (!BTreeTupleIsPosting(curitup))
     523                 :                 {
     524 ECB             :                     /* ... htid is from simple non-pivot tuple */
     525 GIC     3538601 :                     Assert(!inposting);
     526         3538601 :                     htid = curitup->t_tid;
     527 ECB             :                 }
     528 CBC     2278416 :                 else if (!inposting)
     529                 :                 {
     530 ECB             :                     /* ... htid is first TID in new posting list */
     531 GIC       22139 :                     inposting = true;
     532           22139 :                     prevalldead = true;
     533 CBC       22139 :                     curposti = 0;
     534           22139 :                     htid = *BTreeTupleGetPostingN(curitup, 0);
     535 ECB             :                 }
     536                 :                 else
     537                 :                 {
     538                 :                     /* ... htid is second or subsequent TID in posting list */
     539 GIC     2256277 :                     Assert(curposti > 0);
     540         2256277 :                     htid = *BTreeTupleGetPostingN(curitup, curposti);
     541 ECB             :                 }
     542                 : 
     543                 :                 /*
     544                 :                  * If we are doing a recheck, we expect to find the tuple we
     545                 :                  * are rechecking.  It's not a duplicate, but we have to keep
     546                 :                  * scanning.
     547                 :                  */
     548 GIC     5817118 :                 if (checkUnique == UNIQUE_CHECK_EXISTING &&
     549             101 :                     ItemPointerCompare(&htid, &itup->t_tid) == 0)
     550 ECB             :                 {
     551 CBC          27 :                     found = true;
     552                 :                 }
     553 ECB             : 
     554                 :                 /*
     555                 :                  * Check if there's any table tuples for this index entry
     556                 :                  * satisfying SnapshotDirty. This is necessary because for AMs
     557                 :                  * with optimizations like heap's HOT, we have just a single
     558                 :                  * index entry for the entire chain.
     559                 :                  */
     560 GIC     5816990 :                 else if (table_index_fetch_tuple_check(heapRel, &htid,
     561                 :                                                        &SnapshotDirty,
     562 ECB             :                                                        &all_dead))
     563                 :                 {
     564                 :                     TransactionId xwait;
     565                 : 
     566                 :                     /*
     567                 :                      * It is a duplicate. If we are only doing a partial
     568                 :                      * check, then don't bother checking if the tuple is being
     569                 :                      * updated in another transaction. Just return the fact
     570                 :                      * that it is a potential conflict and leave the full
     571                 :                      * check till later. Don't invalidate binary search
     572                 :                      * bounds.
     573                 :                      */
     574 GIC         321 :                     if (checkUnique == UNIQUE_CHECK_PARTIAL)
     575                 :                     {
     576 CBC          54 :                         if (nbuf != InvalidBuffer)
     577 UIC           0 :                             _bt_relbuf(rel, nbuf);
     578 CBC          54 :                         *is_unique = false;
     579 GBC          68 :                         return InvalidTransactionId;
     580 ECB             :                     }
     581                 : 
     582                 :                     /*
     583                 :                      * If this tuple is being updated by other transaction
     584                 :                      * then we have to wait for its commit/abort.
     585                 :                      */
     586 GIC         534 :                     xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
     587             267 :                         SnapshotDirty.xmin : SnapshotDirty.xmax;
     588 ECB             : 
     589 CBC         267 :                     if (TransactionIdIsValid(xwait))
     590                 :                     {
     591              14 :                         if (nbuf != InvalidBuffer)
     592 UIC           0 :                             _bt_relbuf(rel, nbuf);
     593 ECB             :                         /* Tell _bt_doinsert to wait... */
     594 GBC          14 :                         *speculativeToken = SnapshotDirty.speculativeToken;
     595                 :                         /* Caller releases lock on buf immediately */
     596 CBC          14 :                         insertstate->bounds_valid = false;
     597 GIC          14 :                         return xwait;
     598 ECB             :                     }
     599                 : 
     600                 :                     /*
     601                 :                      * Otherwise we have a definite conflict.  But before
     602                 :                      * complaining, look to see if the tuple we want to insert
     603                 :                      * is itself now committed dead --- if so, don't complain.
     604                 :                      * This is a waste of time in normal scenarios but we must
     605                 :                      * do it to support CREATE INDEX CONCURRENTLY.
     606                 :                      *
     607                 :                      * We must follow HOT-chains here because during
     608                 :                      * concurrent index build, we insert the root TID though
     609                 :                      * the actual tuple may be somewhere in the HOT-chain.
     610                 :                      * While following the chain we might not stop at the
     611                 :                      * exact tuple which triggered the insert, but that's OK
     612                 :                      * because if we find a live tuple anywhere in this chain,
     613                 :                      * we have a unique key conflict.  The other live tuple is
     614                 :                      * not part of this chain because it had a different index
     615                 :                      * entry.
     616                 :                      */
     617 GIC         253 :                     htid = itup->t_tid;
     618             253 :                     if (table_index_fetch_tuple_check(heapRel, &htid,
     619 ECB             :                                                       SnapshotSelf, NULL))
     620                 :                     {
     621                 :                         /* Normal case --- it's still live */
     622                 :                     }
     623                 :                     else
     624                 :                     {
     625                 :                         /*
     626                 :                          * It's been deleted, so no error, and no need to
     627                 :                          * continue searching
     628                 :                          */
     629 UIC           0 :                         break;
     630                 :                     }
     631 EUB             : 
     632                 :                     /*
     633                 :                      * Check for a conflict-in as we would if we were going to
     634                 :                      * write to this page.  We aren't actually going to write,
     635                 :                      * but we want a chance to report SSI conflicts that would
     636                 :                      * otherwise be masked by this unique constraint
     637                 :                      * violation.
     638                 :                      */
     639 GIC         253 :                     CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
     640                 : 
     641 ECB             :                     /*
     642                 :                      * This is a definite conflict.  Break the tuple down into
     643                 :                      * datums and report the error.  But first, make sure we
     644                 :                      * release the buffer locks we're holding ---
     645                 :                      * BuildIndexValueDescription could make catalog accesses,
     646                 :                      * which in the worst case might touch this same index and
     647                 :                      * cause deadlocks.
     648                 :                      */
     649 GIC         249 :                     if (nbuf != InvalidBuffer)
     650 UIC           0 :                         _bt_relbuf(rel, nbuf);
     651 CBC         249 :                     _bt_relbuf(rel, insertstate->buf);
     652 GBC         249 :                     insertstate->buf = InvalidBuffer;
     653 CBC         249 :                     insertstate->bounds_valid = false;
     654 ECB             : 
     655                 :                     {
     656                 :                         Datum       values[INDEX_MAX_KEYS];
     657                 :                         bool        isnull[INDEX_MAX_KEYS];
     658                 :                         char       *key_desc;
     659                 : 
     660 GIC         249 :                         index_deform_tuple(itup, RelationGetDescr(rel),
     661                 :                                            values, isnull);
     662 ECB             : 
     663 GIC         249 :                         key_desc = BuildIndexValueDescription(rel, values,
     664                 :                                                               isnull);
     665 ECB             : 
     666 GIC         249 :                         ereport(ERROR,
     667                 :                                 (errcode(ERRCODE_UNIQUE_VIOLATION),
     668 ECB             :                                  errmsg("duplicate key value violates unique constraint \"%s\"",
     669                 :                                         RelationGetRelationName(rel)),
     670                 :                                  key_desc ? errdetail("Key %s already exists.",
     671                 :                                                       key_desc) : 0,
     672                 :                                  errtableconstraint(heapRel,
     673                 :                                                     RelationGetRelationName(rel))));
     674                 :                     }
     675                 :                 }
     676 GIC     5816669 :                 else if (all_dead && (!inposting ||
     677           14368 :                                       (prevalldead &&
     678 CBC       14368 :                                        curposti == BTreeTupleGetNPosting(curitup) - 1)))
     679 ECB             :                 {
     680                 :                     /*
     681                 :                      * The conflicting tuple (or all HOT chains pointed to by
     682                 :                      * all posting list TIDs) is dead to everyone, so mark the
     683                 :                      * index entry killed.
     684                 :                      */
     685 GIC       47834 :                     ItemIdMarkDead(curitemid);
     686           47834 :                     opaque->btpo_flags |= BTP_HAS_GARBAGE;
     687 ECB             : 
     688                 :                     /*
     689                 :                      * Mark buffer with a dirty hint, since state is not
     690                 :                      * crucial. Be sure to mark the proper buffer dirty.
     691                 :                      */
     692 GIC       47834 :                     if (nbuf != InvalidBuffer)
     693               3 :                         MarkBufferDirtyHint(nbuf, true);
     694 ECB             :                     else
     695 CBC       47831 :                         MarkBufferDirtyHint(insertstate->buf, true);
     696                 :                 }
     697 ECB             : 
     698                 :                 /*
     699                 :                  * Remember if posting list tuple has even a single HOT chain
     700                 :                  * whose members are not all dead
     701                 :                  */
     702 GIC     5816696 :                 if (!all_dead && inposting)
     703         2263817 :                     prevalldead = false;
     704 ECB             :             }
     705                 :         }
     706                 : 
     707 GIC     8891042 :         if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
     708                 :         {
     709 ECB             :             /* Advance to next TID in same posting list */
     710 GIC     2256277 :             curposti++;
     711         2256277 :             continue;
     712 ECB             :         }
     713 CBC     6634765 :         else if (offset < maxoff)
     714                 :         {
     715 ECB             :             /* Advance to next tuple */
     716 GIC     3775532 :             curposti = 0;
     717         3775532 :             inposting = false;
     718 CBC     3775532 :             offset = OffsetNumberNext(offset);
     719 ECB             :         }
     720                 :         else
     721                 :         {
     722                 :             int         highkeycmp;
     723                 : 
     724                 :             /* If scankey == hikey we gotta check the next page too */
     725 GIC     2859233 :             if (P_RIGHTMOST(opaque))
     726         2782665 :                 break;
     727 CBC       76568 :             highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
     728           76568 :             Assert(highkeycmp <= 0);
     729           76568 :             if (highkeycmp != 0)
     730           71546 :                 break;
     731 ECB             :             /* Advance to next non-dead page --- there must be one */
     732                 :             for (;;)
     733 UIC           0 :             {
     734 GIC        5022 :                 BlockNumber nblkno = opaque->btpo_next;
     735 EUB             : 
     736 CBC        5022 :                 nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
     737 GIC        5022 :                 page = BufferGetPage(nbuf);
     738 CBC        5022 :                 opaque = BTPageGetOpaque(page);
     739            5022 :                 if (!P_IGNORE(opaque))
     740            5022 :                     break;
     741 LBC           0 :                 if (P_RIGHTMOST(opaque))
     742               0 :                     elog(ERROR, "fell off the end of index \"%s\"",
     743 EUB             :                          RelationGetRelationName(rel));
     744                 :             }
     745                 :             /* Will also advance to next tuple */
     746 GIC        5022 :             curposti = 0;
     747            5022 :             inposting = false;
     748 CBC        5022 :             maxoff = PageGetMaxOffsetNumber(page);
     749            5022 :             offset = P_FIRSTDATAKEY(opaque);
     750 ECB             :             /* Don't invalidate binary search bounds */
     751                 :         }
     752                 :     }
     753                 : 
     754                 :     /*
     755                 :      * If we are doing a recheck then we should have found the tuple we are
     756                 :      * checking.  Otherwise there's something very wrong --- probably, the
     757                 :      * index is on a non-immutable expression.
     758                 :      */
     759 GIC     5249042 :     if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
     760 UIC           0 :         ereport(ERROR,
     761 ECB             :                 (errcode(ERRCODE_INTERNAL_ERROR),
     762 EUB             :                  errmsg("failed to re-find tuple within index \"%s\"",
     763                 :                         RelationGetRelationName(rel)),
     764                 :                  errhint("This may be because of a non-immutable index expression."),
     765                 :                  errtableconstraint(heapRel,
     766                 :                                     RelationGetRelationName(rel))));
     767                 : 
     768 GIC     5249042 :     if (nbuf != InvalidBuffer)
     769            2880 :         _bt_relbuf(rel, nbuf);
     770 ECB             : 
     771 CBC     5249042 :     return InvalidTransactionId;
     772                 : }
     773 ECB             : 
     774                 : 
     775                 : /*
     776                 :  *  _bt_findinsertloc() -- Finds an insert location for a tuple
     777                 :  *
     778                 :  *      On entry, insertstate buffer contains the page the new tuple belongs
     779                 :  *      on.  It is exclusive-locked and pinned by the caller.
     780                 :  *
     781                 :  *      If 'checkingunique' is true, the buffer on entry is the first page
     782                 :  *      that contains duplicates of the new key.  If there are duplicates on
     783                 :  *      multiple pages, the correct insertion position might be some page to
     784                 :  *      the right, rather than the first page.  In that case, this function
     785                 :  *      moves right to the correct target page.
     786                 :  *
     787                 :  *      (In a !heapkeyspace index, there can be multiple pages with the same
     788                 :  *      high key, where the new tuple could legitimately be placed on.  In
     789                 :  *      that case, the caller passes the first page containing duplicates,
     790                 :  *      just like when checkingunique=true.  If that page doesn't have enough
     791                 :  *      room for the new tuple, this function moves right, trying to find a
     792                 :  *      legal page that does.)
     793                 :  *
     794                 :  *      If 'indexUnchanged' is true, this is for an UPDATE that didn't
     795                 :  *      logically change the indexed value, but must nevertheless have a new
     796                 :  *      entry to point to a successor version.  This hint from the executor
     797                 :  *      will influence our behavior when the page might have to be split and
     798                 :  *      we must consider our options.  Bottom-up index deletion can avoid
     799                 :  *      pathological version-driven page splits, but we only want to go to the
     800                 :  *      trouble of trying it when we already have moderate confidence that
     801                 :  *      it's appropriate.  The hint should not significantly affect our
     802                 :  *      behavior over time unless practically all inserts on to the leaf page
     803                 :  *      get the hint.
     804                 :  *
     805                 :  *      On exit, insertstate buffer contains the chosen insertion page, and
     806                 :  *      the offset within that page is returned.  If _bt_findinsertloc needed
     807                 :  *      to move right, the lock and pin on the original page are released, and
     808                 :  *      the new buffer is exclusively locked and pinned instead.
     809                 :  *
     810                 :  *      If insertstate contains cached binary search bounds, we will take
     811                 :  *      advantage of them.  This avoids repeating comparisons that we made in
     812                 :  *      _bt_check_unique() already.
     813                 :  */
     814                 : static OffsetNumber
     815 GIC     7166874 : _bt_findinsertloc(Relation rel,
     816                 :                   BTInsertState insertstate,
     817 ECB             :                   bool checkingunique,
     818                 :                   bool indexUnchanged,
     819                 :                   BTStack stack,
     820                 :                   Relation heapRel)
     821                 : {
     822 GIC     7166874 :     BTScanInsert itup_key = insertstate->itup_key;
     823         7166874 :     Page        page = BufferGetPage(insertstate->buf);
     824 ECB             :     BTPageOpaque opaque;
     825                 :     OffsetNumber newitemoff;
     826                 : 
     827 GIC     7166874 :     opaque = BTPageGetOpaque(page);
     828                 : 
     829 ECB             :     /* Check 1/3 of a page restriction */
     830 GIC     7166874 :     if (unlikely(insertstate->itemsz > BTMaxItemSize(page)))
     831 UIC           0 :         _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
     832 ECB             :                              insertstate->itup);
     833 EUB             : 
     834 GIC     7166874 :     Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
     835         7166874 :     Assert(!insertstate->bounds_valid || checkingunique);
     836 CBC     7166874 :     Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
     837         7166874 :     Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
     838         7166874 :     Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
     839 ECB             : 
     840 CBC     7166874 :     if (itup_key->heapkeyspace)
     841                 :     {
     842 ECB             :         /* Keep track of whether checkingunique duplicate seen */
     843 GIC     7166874 :         bool        uniquedup = indexUnchanged;
     844                 : 
     845 ECB             :         /*
     846                 :          * If we're inserting into a unique index, we may have to walk right
     847                 :          * through leaf pages to find the one leaf page that we must insert on
     848                 :          * to.
     849                 :          *
     850                 :          * This is needed for checkingunique callers because a scantid was not
     851                 :          * used when we called _bt_search().  scantid can only be set after
     852                 :          * _bt_check_unique() has checked for duplicates.  The buffer
     853                 :          * initially stored in insertstate->buf has the page where the first
     854                 :          * duplicate key might be found, which isn't always the page that new
     855                 :          * tuple belongs on.  The heap TID attribute for new tuple (scantid)
     856                 :          * could force us to insert on a sibling page, though that should be
     857                 :          * very rare in practice.
     858                 :          */
     859 GIC     7166874 :         if (checkingunique)
     860                 :         {
     861 CBC     5249066 :             if (insertstate->low < insertstate->stricthigh)
     862                 :             {
     863 ECB             :                 /* Encountered a duplicate in _bt_check_unique() */
     864 GIC      317864 :                 Assert(insertstate->bounds_valid);
     865          317864 :                 uniquedup = true;
     866 ECB             :             }
     867                 : 
     868                 :             for (;;)
     869                 :             {
     870                 :                 /*
     871                 :                  * Does the new tuple belong on this page?
     872                 :                  *
     873                 :                  * The earlier _bt_check_unique() call may well have
     874                 :                  * established a strict upper bound on the offset for the new
     875                 :                  * item.  If it's not the last item of the page (i.e. if there
     876                 :                  * is at least one tuple on the page that goes after the tuple
     877                 :                  * we're inserting) then we know that the tuple belongs on
     878                 :                  * this page.  We can skip the high key check.
     879                 :                  */
     880 GIC     5254088 :                 if (insertstate->bounds_valid &&
     881        10481378 :                     insertstate->low <= insertstate->stricthigh &&
     882 CBC     5240689 :                     insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
     883         2380612 :                     break;
     884 ECB             : 
     885                 :                 /* Test '<=', not '!=', since scantid is set now */
     886 GIC     2957725 :                 if (P_RIGHTMOST(opaque) ||
     887           84249 :                     _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
     888 ECB             :                     break;
     889                 : 
     890 GNC        5022 :                 _bt_stepright(rel, heapRel, insertstate, stack);
     891                 :                 /* Update local state after stepping right */
     892 CBC        5022 :                 page = BufferGetPage(insertstate->buf);
     893 GIC        5022 :                 opaque = BTPageGetOpaque(page);
     894 ECB             :                 /* Assume duplicates (if checkingunique) */
     895 CBC        5022 :                 uniquedup = true;
     896                 :             }
     897 ECB             :         }
     898                 : 
     899                 :         /*
     900                 :          * If the target page cannot fit newitem, try to avoid splitting the
     901                 :          * page on insert by performing deletion or deduplication now
     902                 :          */
     903 GIC     7166874 :         if (PageGetFreeSpace(page) < insertstate->itemsz)
     904           50681 :             _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
     905 ECB             :                                          checkingunique, uniquedup,
     906                 :                                          indexUnchanged);
     907                 :     }
     908                 :     else
     909                 :     {
     910                 :         /*----------
     911                 :          * This is a !heapkeyspace (version 2 or 3) index.  The current page
     912                 :          * is the first page that we could insert the new tuple to, but there
     913                 :          * may be other pages to the right that we could opt to use instead.
     914                 :          *
     915                 :          * If the new key is equal to one or more existing keys, we can
     916                 :          * legitimately place it anywhere in the series of equal keys.  In
     917                 :          * fact, if the new key is equal to the page's "high key" we can place
     918                 :          * it on the next page.  If it is equal to the high key, and there's
     919                 :          * not room to insert the new tuple on the current page without
     920                 :          * splitting, then we move right hoping to find more free space and
     921                 :          * avoid a split.
     922                 :          *
     923                 :          * Keep scanning right until we
     924                 :          *      (a) find a page with enough free space,
     925                 :          *      (b) reach the last page where the tuple can legally go, or
     926                 :          *      (c) get tired of searching.
     927                 :          * (c) is not flippant; it is important because if there are many
     928                 :          * pages' worth of equal keys, it's better to split one of the early
     929                 :          * pages than to scan all the way to the end of the run of equal keys
     930                 :          * on every insert.  We implement "get tired" as a random choice,
     931                 :          * since stopping after scanning a fixed number of pages wouldn't work
     932                 :          * well (we'd never reach the right-hand side of previously split
     933                 :          * pages).  The probability of moving right is set at 0.99, which may
     934                 :          * seem too high to change the behavior much, but it does an excellent
     935                 :          * job of preventing O(N^2) behavior with many equal keys.
     936                 :          *----------
     937                 :          */
     938 UIC           0 :         while (PageGetFreeSpace(page) < insertstate->itemsz)
     939                 :         {
     940 EUB             :             /*
     941                 :              * Before considering moving right, see if we can obtain enough
     942                 :              * space by erasing LP_DEAD items
     943                 :              */
     944 UIC           0 :             if (P_HAS_GARBAGE(opaque))
     945                 :             {
     946 EUB             :                 /* Perform simple deletion */
     947 UIC           0 :                 _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
     948                 :                                              false, false, false);
     949 EUB             : 
     950 UIC           0 :                 if (PageGetFreeSpace(page) >= insertstate->itemsz)
     951               0 :                     break;      /* OK, now we have enough space */
     952 EUB             :             }
     953                 : 
     954                 :             /*
     955                 :              * Nope, so check conditions (b) and (c) enumerated above
     956                 :              *
     957                 :              * The earlier _bt_check_unique() call may well have established a
     958                 :              * strict upper bound on the offset for the new item.  If it's not
     959                 :              * the last item of the page (i.e. if there is at least one tuple
     960                 :              * on the page that's greater than the tuple we're inserting to)
     961                 :              * then we know that the tuple belongs on this page.  We can skip
     962                 :              * the high key check.
     963                 :              */
     964 UIC           0 :             if (insertstate->bounds_valid &&
     965               0 :                 insertstate->low <= insertstate->stricthigh &&
     966 UBC           0 :                 insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
     967               0 :                 break;
     968 EUB             : 
     969 UBC           0 :             if (P_RIGHTMOST(opaque) ||
     970 UIC           0 :                 _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
     971 UBC           0 :                 pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
     972 EUB             :                 break;
     973                 : 
     974 UNC           0 :             _bt_stepright(rel, heapRel, insertstate, stack);
     975                 :             /* Update local state after stepping right */
     976 UBC           0 :             page = BufferGetPage(insertstate->buf);
     977 UIC           0 :             opaque = BTPageGetOpaque(page);
     978 EUB             :         }
     979                 :     }
     980                 : 
     981                 :     /*
     982                 :      * We should now be on the correct page.  Find the offset within the page
     983                 :      * for the new tuple. (Possibly reusing earlier search bounds.)
     984                 :      */
     985 GIC     7166874 :     Assert(P_RIGHTMOST(opaque) ||
     986                 :            _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
     987 ECB             : 
     988 GIC     7166874 :     newitemoff = _bt_binsrch_insert(rel, insertstate);
     989                 : 
     990 CBC     7166874 :     if (insertstate->postingoff == -1)
     991                 :     {
     992 ECB             :         /*
     993                 :          * There is an overlapping posting list tuple with its LP_DEAD bit
     994                 :          * set.  We don't want to unnecessarily unset its LP_DEAD bit while
     995                 :          * performing a posting list split, so perform simple index tuple
     996                 :          * deletion early.
     997                 :          */
     998 UIC           0 :         _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
     999                 :                                      false, false, false);
    1000 EUB             : 
    1001                 :         /*
    1002                 :          * Do new binary search.  New insert location cannot overlap with any
    1003                 :          * posting list now.
    1004                 :          */
    1005 UIC           0 :         Assert(!insertstate->bounds_valid);
    1006               0 :         insertstate->postingoff = 0;
    1007 UBC           0 :         newitemoff = _bt_binsrch_insert(rel, insertstate);
    1008               0 :         Assert(insertstate->postingoff == 0);
    1009 EUB             :     }
    1010                 : 
    1011 GIC     7166874 :     return newitemoff;
    1012                 : }
    1013 ECB             : 
    1014                 : /*
    1015                 :  * Step right to next non-dead page, during insertion.
    1016                 :  *
    1017                 :  * This is a bit more complicated than moving right in a search.  We must
    1018                 :  * write-lock the target page before releasing write lock on current page;
    1019                 :  * else someone else's _bt_check_unique scan could fail to see our insertion.
    1020                 :  * Write locks on intermediate dead pages won't do because we don't know when
    1021                 :  * they will get de-linked from the tree.
    1022                 :  *
    1023                 :  * This is more aggressive than it needs to be for non-unique !heapkeyspace
    1024                 :  * indexes.
    1025                 :  */
    1026                 : static void
    1027 GNC        5022 : _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack)
    1028                 : {
    1029 ECB             :     Page        page;
    1030                 :     BTPageOpaque opaque;
    1031                 :     Buffer      rbuf;
    1032                 :     BlockNumber rblkno;
    1033                 : 
    1034 GIC        5022 :     page = BufferGetPage(insertstate->buf);
    1035            5022 :     opaque = BTPageGetOpaque(page);
    1036 ECB             : 
    1037 CBC        5022 :     rbuf = InvalidBuffer;
    1038 GIC        5022 :     rblkno = opaque->btpo_next;
    1039 ECB             :     for (;;)
    1040                 :     {
    1041 GIC        5022 :         rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
    1042            5022 :         page = BufferGetPage(rbuf);
    1043 CBC        5022 :         opaque = BTPageGetOpaque(page);
    1044 ECB             : 
    1045                 :         /*
    1046                 :          * If this page was incompletely split, finish the split now.  We do
    1047                 :          * this while holding a lock on the left sibling, which is not good
    1048                 :          * because finishing the split could be a fairly lengthy operation.
    1049                 :          * But this should happen very seldom.
    1050                 :          */
    1051 GIC        5022 :         if (P_INCOMPLETE_SPLIT(opaque))
    1052                 :         {
    1053 UNC           0 :             _bt_finish_split(rel, heaprel, rbuf, stack);
    1054 UIC           0 :             rbuf = InvalidBuffer;
    1055 UBC           0 :             continue;
    1056 EUB             :         }
    1057                 : 
    1058 GIC        5022 :         if (!P_IGNORE(opaque))
    1059            5022 :             break;
    1060 LBC           0 :         if (P_RIGHTMOST(opaque))
    1061               0 :             elog(ERROR, "fell off the end of index \"%s\"",
    1062 EUB             :                  RelationGetRelationName(rel));
    1063                 : 
    1064 UIC           0 :         rblkno = opaque->btpo_next;
    1065                 :     }
    1066 EUB             :     /* rbuf locked; unlock buf, update state for caller */
    1067 GIC        5022 :     _bt_relbuf(rel, insertstate->buf);
    1068            5022 :     insertstate->buf = rbuf;
    1069 CBC        5022 :     insertstate->bounds_valid = false;
    1070            5022 : }
    1071 ECB             : 
    1072                 : /*----------
    1073                 :  *  _bt_insertonpg() -- Insert a tuple on a particular page in the index.
    1074                 :  *
    1075                 :  *      This recursive procedure does the following things:
    1076                 :  *
    1077                 :  *          +  if postingoff != 0, splits existing posting list tuple
    1078                 :  *             (since it overlaps with new 'itup' tuple).
    1079                 :  *          +  if necessary, splits the target page, using 'itup_key' for
    1080                 :  *             suffix truncation on leaf pages (caller passes NULL for
    1081                 :  *             non-leaf pages).
    1082                 :  *          +  inserts the new tuple (might be split from posting list).
    1083                 :  *          +  if the page was split, pops the parent stack, and finds the
    1084                 :  *             right place to insert the new child pointer (by walking
    1085                 :  *             right using information stored in the parent stack).
    1086                 :  *          +  invokes itself with the appropriate tuple for the right
    1087                 :  *             child page on the parent.
    1088                 :  *          +  updates the metapage if a true root or fast root is split.
    1089                 :  *
    1090                 :  *      On entry, we must have the correct buffer in which to do the
    1091                 :  *      insertion, and the buffer must be pinned and write-locked.  On return,
    1092                 :  *      we will have dropped both the pin and the lock on the buffer.
    1093                 :  *
    1094                 :  *      This routine only performs retail tuple insertions.  'itup' should
    1095                 :  *      always be either a non-highkey leaf item, or a downlink (new high
    1096                 :  *      key items are created indirectly, when a page is split).  When
    1097                 :  *      inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
    1098                 :  *      we're inserting the downlink for.  This function will clear the
    1099                 :  *      INCOMPLETE_SPLIT flag on it, and release the buffer.
    1100                 :  *----------
    1101                 :  */
    1102                 : static void
    1103 GIC     7187735 : _bt_insertonpg(Relation rel,
    1104                 :                Relation heaprel,
    1105                 :                BTScanInsert itup_key,
    1106 ECB             :                Buffer buf,
    1107                 :                Buffer cbuf,
    1108                 :                BTStack stack,
    1109                 :                IndexTuple itup,
    1110                 :                Size itemsz,
    1111                 :                OffsetNumber newitemoff,
    1112                 :                int postingoff,
    1113                 :                bool split_only_page)
    1114                 : {
    1115                 :     Page        page;
    1116                 :     BTPageOpaque opaque;
    1117                 :     bool        isleaf,
    1118                 :                 isroot,
    1119                 :                 isrightmost,
    1120                 :                 isonly;
    1121 GIC     7187735 :     IndexTuple  oposting = NULL;
    1122         7187735 :     IndexTuple  origitup = NULL;
    1123         7187735 :     IndexTuple  nposting = NULL;
    1124 ECB             : 
    1125 CBC     7187735 :     page = BufferGetPage(buf);
    1126         7187735 :     opaque = BTPageGetOpaque(page);
    1127 GIC     7187735 :     isleaf = P_ISLEAF(opaque);
    1128 CBC     7187735 :     isroot = P_ISROOT(opaque);
    1129         7187735 :     isrightmost = P_RIGHTMOST(opaque);
    1130         7187735 :     isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
    1131 ECB             : 
    1132                 :     /* child buffer must be given iff inserting on an internal page */
    1133 CBC     7187735 :     Assert(isleaf == !BufferIsValid(cbuf));
    1134                 :     /* tuple must have appropriate number of attributes */
    1135 GIC     7187735 :     Assert(!isleaf ||
    1136 ECB             :            BTreeTupleGetNAtts(itup, rel) ==
    1137                 :            IndexRelationGetNumberOfAttributes(rel));
    1138 CBC     7187735 :     Assert(isleaf ||
    1139                 :            BTreeTupleGetNAtts(itup, rel) <=
    1140                 :            IndexRelationGetNumberOfKeyAttributes(rel));
    1141         7187735 :     Assert(!BTreeTupleIsPosting(itup));
    1142 GIC     7187735 :     Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
    1143                 :     /* Caller must always finish incomplete split for us */
    1144 CBC     7187735 :     Assert(!P_INCOMPLETE_SPLIT(opaque));
    1145 ECB             : 
    1146                 :     /*
    1147                 :      * Every internal page should have exactly one negative infinity item at
    1148                 :      * all times.  Only _bt_split() and _bt_newroot() should add items that
    1149                 :      * become negative infinity items through truncation, since they're the
    1150                 :      * only routines that allocate new internal pages.
    1151                 :      */
    1152 GIC     7187735 :     Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
    1153                 : 
    1154                 :     /*
    1155 ECB             :      * Do we need to split an existing posting list item?
    1156                 :      */
    1157 GIC     7187735 :     if (postingoff != 0)
    1158                 :     {
    1159            7766 :         ItemId      itemid = PageGetItemId(page, newitemoff);
    1160 ECB             : 
    1161                 :         /*
    1162                 :          * The new tuple is a duplicate with a heap TID that falls inside the
    1163                 :          * range of an existing posting list tuple on a leaf page.  Prepare to
    1164                 :          * split an existing posting list.  Overwriting the posting list with
    1165                 :          * its post-split version is treated as an extra step in either the
    1166                 :          * insert or page split critical section.
    1167                 :          */
    1168 GIC        7766 :         Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
    1169            7766 :         oposting = (IndexTuple) PageGetItem(page, itemid);
    1170                 : 
    1171 ECB             :         /*
    1172                 :          * postingoff value comes from earlier call to _bt_binsrch_posting().
    1173                 :          * Its binary search might think that a plain tuple must be a posting
    1174                 :          * list tuple that needs to be split.  This can happen with corruption
    1175                 :          * involving an existing plain tuple that is a duplicate of the new
    1176                 :          * item, up to and including its table TID.  Check for that here in
    1177                 :          * passing.
    1178                 :          *
    1179                 :          * Also verify that our caller has made sure that the existing posting
    1180                 :          * list tuple does not have its LP_DEAD bit set.
    1181                 :          */
    1182 GIC        7766 :         if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
    1183 UIC           0 :             ereport(ERROR,
    1184                 :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    1185 ECB             :                      errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
    1186 EUB             :                                      ItemPointerGetBlockNumber(&itup->t_tid),
    1187                 :                                      ItemPointerGetOffsetNumber(&itup->t_tid),
    1188                 :                                      newitemoff, BufferGetBlockNumber(buf),
    1189                 :                                      RelationGetRelationName(rel))));
    1190                 : 
    1191                 :         /* use a mutable copy of itup as our itup from here on */
    1192 GIC        7766 :         origitup = itup;
    1193            7766 :         itup = CopyIndexTuple(origitup);
    1194            7766 :         nposting = _bt_swap_posting(itup, oposting, postingoff);
    1195 ECB             :         /* itup now contains rightmost/max TID from oposting */
    1196                 : 
    1197                 :         /* Alter offset so that newitem goes after posting list */
    1198 GIC        7766 :         newitemoff = OffsetNumberNext(newitemoff);
    1199                 :     }
    1200                 : 
    1201 ECB             :     /*
    1202                 :      * Do we need to split the page to fit the item on it?
    1203                 :      *
    1204                 :      * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
    1205                 :      * so this comparison is correct even though we appear to be accounting
    1206                 :      * only for the item and not for its line pointer.
    1207                 :      */
    1208 GIC     7187735 :     if (PageGetFreeSpace(page) < itemsz)
    1209                 :     {
    1210                 :         Buffer      rbuf;
    1211 ECB             : 
    1212 GIC       23479 :         Assert(!split_only_page);
    1213                 : 
    1214                 :         /* split the buffer into left and right halves */
    1215 GNC       23479 :         rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
    1216                 :                          itup, origitup, nposting, postingoff);
    1217 GIC       23479 :         PredicateLockPageSplit(rel,
    1218 ECB             :                                BufferGetBlockNumber(buf),
    1219                 :                                BufferGetBlockNumber(rbuf));
    1220                 : 
    1221                 :         /*----------
    1222                 :          * By here,
    1223                 :          *
    1224                 :          *      +  our target page has been split;
    1225                 :          *      +  the original tuple has been inserted;
    1226                 :          *      +  we have write locks on both the old (left half)
    1227                 :          *         and new (right half) buffers, after the split; and
    1228                 :          *      +  we know the key we want to insert into the parent
    1229                 :          *         (it's the "high key" on the left child page).
    1230                 :          *
    1231                 :          * We're ready to do the parent insertion.  We need to hold onto the
    1232                 :          * locks for the child pages until we locate the parent, but we can
    1233                 :          * at least release the lock on the right child before doing the
    1234                 :          * actual insertion.  The lock on the left child will be released
    1235                 :          * last of all by parent insertion, where it is the 'cbuf' of parent
    1236                 :          * page.
    1237                 :          *----------
    1238                 :          */
    1239 GNC       23479 :         _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
    1240                 :     }
    1241                 :     else
    1242 ECB             :     {
    1243 GIC     7164256 :         Buffer      metabuf = InvalidBuffer;
    1244         7164256 :         Page        metapg = NULL;
    1245         7164256 :         BTMetaPageData *metad = NULL;
    1246 ECB             :         BlockNumber blockcache;
    1247                 : 
    1248                 :         /*
    1249                 :          * If we are doing this insert because we split a page that was the
    1250                 :          * only one on its tree level, but was not the root, it may have been
    1251                 :          * the "fast root".  We need to ensure that the fast root link points
    1252                 :          * at or above the current page.  We can safely acquire a lock on the
    1253                 :          * metapage here --- see comments for _bt_newroot().
    1254                 :          */
    1255 GIC     7164256 :         if (unlikely(split_only_page))
    1256                 :         {
    1257              12 :             Assert(!isleaf);
    1258 CBC          12 :             Assert(BufferIsValid(cbuf));
    1259                 : 
    1260 GNC          12 :             metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE);
    1261 CBC          12 :             metapg = BufferGetPage(metabuf);
    1262 GIC          12 :             metad = BTPageGetMeta(metapg);
    1263 ECB             : 
    1264 CBC          12 :             if (metad->btm_fastlevel >= opaque->btpo_level)
    1265 ECB             :             {
    1266                 :                 /* no update wanted */
    1267 LBC           0 :                 _bt_relbuf(rel, metabuf);
    1268 UIC           0 :                 metabuf = InvalidBuffer;
    1269                 :             }
    1270 EUB             :         }
    1271                 : 
    1272                 :         /* Do the update.  No ereport(ERROR) until changes are logged */
    1273 GIC     7164256 :         START_CRIT_SECTION();
    1274                 : 
    1275         7164256 :         if (postingoff != 0)
    1276 CBC        7740 :             memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
    1277                 : 
    1278         7164256 :         if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
    1279 ECB             :                         false) == InvalidOffsetNumber)
    1280 UIC           0 :             elog(PANIC, "failed to add new item to block %u in index \"%s\"",
    1281 ECB             :                  BufferGetBlockNumber(buf), RelationGetRelationName(rel));
    1282                 : 
    1283 GBC     7164256 :         MarkBufferDirty(buf);
    1284                 : 
    1285 GIC     7164256 :         if (BufferIsValid(metabuf))
    1286 ECB             :         {
    1287                 :             /* upgrade meta-page if needed */
    1288 CBC          12 :             if (metad->btm_version < BTREE_NOVAC_VERSION)
    1289 UIC           0 :                 _bt_upgrademetapage(metapg);
    1290 GIC          12 :             metad->btm_fastroot = BufferGetBlockNumber(buf);
    1291 CBC          12 :             metad->btm_fastlevel = opaque->btpo_level;
    1292 GBC          12 :             MarkBufferDirty(metabuf);
    1293 ECB             :         }
    1294                 : 
    1295                 :         /*
    1296                 :          * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
    1297                 :          * finishes a split
    1298                 :          */
    1299 GIC     7164256 :         if (!isleaf)
    1300                 :         {
    1301           20760 :             Page        cpage = BufferGetPage(cbuf);
    1302 CBC       20760 :             BTPageOpaque cpageop = BTPageGetOpaque(cpage);
    1303                 : 
    1304           20760 :             Assert(P_INCOMPLETE_SPLIT(cpageop));
    1305           20760 :             cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    1306 GIC       20760 :             MarkBufferDirty(cbuf);
    1307 ECB             :         }
    1308                 : 
    1309                 :         /* XLOG stuff */
    1310 GIC     7164256 :         if (RelationNeedsWAL(rel))
    1311                 :         {
    1312                 :             xl_btree_insert xlrec;
    1313 ECB             :             xl_btree_metadata xlmeta;
    1314                 :             uint8       xlinfo;
    1315                 :             XLogRecPtr  recptr;
    1316                 :             uint16      upostingoff;
    1317                 : 
    1318 GIC     6844337 :             xlrec.offnum = newitemoff;
    1319                 : 
    1320         6844337 :             XLogBeginInsert();
    1321 CBC     6844337 :             XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
    1322                 : 
    1323         6844337 :             if (isleaf && postingoff == 0)
    1324 ECB             :             {
    1325                 :                 /* Simple leaf insert */
    1326 CBC     6816655 :                 xlinfo = XLOG_BTREE_INSERT_LEAF;
    1327                 :             }
    1328 GIC       27682 :             else if (postingoff != 0)
    1329 ECB             :             {
    1330                 :                 /*
    1331                 :                  * Leaf insert with posting list split.  Must include
    1332                 :                  * postingoff field before newitem/orignewitem.
    1333                 :                  */
    1334 GIC        7740 :                 Assert(isleaf);
    1335            7740 :                 xlinfo = XLOG_BTREE_INSERT_POST;
    1336                 :             }
    1337 ECB             :             else
    1338                 :             {
    1339                 :                 /* Internal page insert, which finishes a split on cbuf */
    1340 GIC       19942 :                 xlinfo = XLOG_BTREE_INSERT_UPPER;
    1341           19942 :                 XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
    1342                 : 
    1343 CBC       19942 :                 if (BufferIsValid(metabuf))
    1344 ECB             :                 {
    1345                 :                     /* Actually, it's an internal page insert + meta update */
    1346 CBC          12 :                     xlinfo = XLOG_BTREE_INSERT_META;
    1347                 : 
    1348 GIC          12 :                     Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    1349 CBC          12 :                     xlmeta.version = metad->btm_version;
    1350 GIC          12 :                     xlmeta.root = metad->btm_root;
    1351 CBC          12 :                     xlmeta.level = metad->btm_level;
    1352              12 :                     xlmeta.fastroot = metad->btm_fastroot;
    1353              12 :                     xlmeta.fastlevel = metad->btm_fastlevel;
    1354              12 :                     xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    1355              12 :                     xlmeta.allequalimage = metad->btm_allequalimage;
    1356 ECB             : 
    1357 CBC          12 :                     XLogRegisterBuffer(2, metabuf,
    1358 ECB             :                                        REGBUF_WILL_INIT | REGBUF_STANDARD);
    1359 GIC          12 :                     XLogRegisterBufData(2, (char *) &xlmeta,
    1360 ECB             :                                         sizeof(xl_btree_metadata));
    1361                 :                 }
    1362                 :             }
    1363                 : 
    1364 GIC     6844337 :             XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1365         6844337 :             if (postingoff == 0)
    1366                 :             {
    1367 ECB             :                 /* Just log itup from caller */
    1368 CBC     6836597 :                 XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
    1369                 :             }
    1370                 :             else
    1371 ECB             :             {
    1372                 :                 /*
    1373                 :                  * Insert with posting list split (XLOG_BTREE_INSERT_POST
    1374                 :                  * record) case.
    1375                 :                  *
    1376                 :                  * Log postingoff.  Also log origitup, not itup.  REDO routine
    1377                 :                  * must reconstruct final itup (as well as nposting) using
    1378                 :                  * _bt_swap_posting().
    1379                 :                  */
    1380 GIC        7740 :                 upostingoff = postingoff;
    1381                 : 
    1382            7740 :                 XLogRegisterBufData(0, (char *) &upostingoff, sizeof(uint16));
    1383 CBC        7740 :                 XLogRegisterBufData(0, (char *) origitup,
    1384 GIC        7740 :                                     IndexTupleSize(origitup));
    1385 ECB             :             }
    1386                 : 
    1387 CBC     6844337 :             recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    1388                 : 
    1389 GIC     6844337 :             if (BufferIsValid(metabuf))
    1390 CBC          12 :                 PageSetLSN(metapg, recptr);
    1391 GIC     6844337 :             if (!isleaf)
    1392 CBC       19942 :                 PageSetLSN(BufferGetPage(cbuf), recptr);
    1393 ECB             : 
    1394 CBC     6844337 :             PageSetLSN(page, recptr);
    1395 ECB             :         }
    1396                 : 
    1397 CBC     7164256 :         END_CRIT_SECTION();
    1398                 : 
    1399                 :         /* Release subsidiary buffers */
    1400         7164256 :         if (BufferIsValid(metabuf))
    1401 GIC          12 :             _bt_relbuf(rel, metabuf);
    1402         7164256 :         if (!isleaf)
    1403 CBC       20760 :             _bt_relbuf(rel, cbuf);
    1404 ECB             : 
    1405                 :         /*
    1406                 :          * Cache the block number if this is the rightmost leaf page.  Cache
    1407                 :          * may be used by a future inserter within _bt_search_insert().
    1408                 :          */
    1409 GIC     7164256 :         blockcache = InvalidBlockNumber;
    1410         7164256 :         if (isrightmost && isleaf && !isroot)
    1411         3449620 :             blockcache = BufferGetBlockNumber(buf);
    1412 ECB             : 
    1413                 :         /* Release buffer for insertion target block */
    1414 CBC     7164256 :         _bt_relbuf(rel, buf);
    1415                 : 
    1416                 :         /*
    1417 ECB             :          * If we decided to cache the insertion target block before releasing
    1418                 :          * its buffer lock, then cache it now.  Check the height of the tree
    1419                 :          * first, though.  We don't go for the optimization with small
    1420                 :          * indexes.  Defer final check to this point to ensure that we don't
    1421                 :          * call _bt_getrootheight while holding a buffer lock.
    1422                 :          */
    1423 GIC    10613876 :         if (BlockNumberIsValid(blockcache) &&
    1424 GNC     3449620 :             _bt_getrootheight(rel, heaprel) >= BTREE_FASTPATH_MIN_LEVEL)
    1425 GIC       34278 :             RelationSetTargetBlock(rel, blockcache);
    1426 ECB             :     }
    1427                 : 
    1428                 :     /* be tidy */
    1429 GIC     7187735 :     if (postingoff != 0)
    1430                 :     {
    1431                 :         /* itup is actually a modified copy of caller's original */
    1432 CBC        7766 :         pfree(nposting);
    1433 GIC        7766 :         pfree(itup);
    1434                 :     }
    1435 CBC     7187735 : }
    1436 ECB             : 
    1437                 : /*
    1438                 :  *  _bt_split() -- split a page in the btree.
    1439                 :  *
    1440                 :  *      On entry, buf is the page to split, and is pinned and write-locked.
    1441                 :  *      newitemoff etc. tell us about the new item that must be inserted
    1442                 :  *      along with the data from the original page.
    1443                 :  *
    1444                 :  *      itup_key is used for suffix truncation on leaf pages (internal
    1445                 :  *      page callers pass NULL).  When splitting a non-leaf page, 'cbuf'
    1446                 :  *      is the left-sibling of the page we're inserting the downlink for.
    1447                 :  *      This function will clear the INCOMPLETE_SPLIT flag on it, and
    1448                 :  *      release the buffer.
    1449                 :  *
    1450                 :  *      orignewitem, nposting, and postingoff are needed when an insert of
    1451                 :  *      orignewitem results in both a posting list split and a page split.
    1452                 :  *      These extra posting list split details are used here in the same
    1453                 :  *      way as they are used in the more common case where a posting list
    1454                 :  *      split does not coincide with a page split.  We need to deal with
    1455                 :  *      posting list splits directly in order to ensure that everything
    1456                 :  *      that follows from the insert of orignewitem is handled as a single
    1457                 :  *      atomic operation (though caller's insert of a new pivot/downlink
    1458                 :  *      into parent page will still be a separate operation).  See
    1459                 :  *      nbtree/README for details on the design of posting list splits.
    1460                 :  *
    1461                 :  *      Returns the new right sibling of buf, pinned and write-locked.
    1462                 :  *      The pin and lock on buf are maintained.
    1463                 :  */
    1464                 : static Buffer
    1465 GNC       23479 : _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
    1466                 :           Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
    1467                 :           IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
    1468 ECB             : {
    1469                 :     Buffer      rbuf;
    1470                 :     Page        origpage;
    1471                 :     Page        leftpage,
    1472                 :                 rightpage;
    1473                 :     BlockNumber origpagenumber,
    1474                 :                 rightpagenumber;
    1475                 :     BTPageOpaque ropaque,
    1476                 :                 lopaque,
    1477                 :                 oopaque;
    1478 GIC       23479 :     Buffer      sbuf = InvalidBuffer;
    1479           23479 :     Page        spage = NULL;
    1480           23479 :     BTPageOpaque sopaque = NULL;
    1481 ECB             :     Size        itemsz;
    1482                 :     ItemId      itemid;
    1483                 :     IndexTuple  firstright,
    1484                 :                 lefthighkey;
    1485                 :     OffsetNumber firstrightoff;
    1486                 :     OffsetNumber afterleftoff,
    1487                 :                 afterrightoff,
    1488                 :                 minusinfoff;
    1489                 :     OffsetNumber origpagepostingoff;
    1490                 :     OffsetNumber maxoff;
    1491                 :     OffsetNumber i;
    1492                 :     bool        newitemonleft,
    1493                 :                 isleaf,
    1494                 :                 isrightmost;
    1495                 : 
    1496                 :     /*
    1497                 :      * origpage is the original page to be split.  leftpage is a temporary
    1498                 :      * buffer that receives the left-sibling data, which will be copied back
    1499                 :      * into origpage on success.  rightpage is the new page that will receive
    1500                 :      * the right-sibling data.
    1501                 :      *
    1502                 :      * leftpage is allocated after choosing a split point.  rightpage's new
    1503                 :      * buffer isn't acquired until after leftpage is initialized and has new
    1504                 :      * high key, the last point where splitting the page may fail (barring
    1505                 :      * corruption).  Failing before acquiring new buffer won't have lasting
    1506                 :      * consequences, since origpage won't have been modified and leftpage is
    1507                 :      * only workspace.
    1508                 :      */
    1509 GIC       23479 :     origpage = BufferGetPage(buf);
    1510           23479 :     oopaque = BTPageGetOpaque(origpage);
    1511           23479 :     isleaf = P_ISLEAF(oopaque);
    1512 CBC       23479 :     isrightmost = P_RIGHTMOST(oopaque);
    1513           23479 :     maxoff = PageGetMaxOffsetNumber(origpage);
    1514           23479 :     origpagenumber = BufferGetBlockNumber(buf);
    1515 ECB             : 
    1516                 :     /*
    1517                 :      * Choose a point to split origpage at.
    1518                 :      *
    1519                 :      * A split point can be thought of as a point _between_ two existing data
    1520                 :      * items on origpage (the lastleft and firstright tuples), provided you
    1521                 :      * pretend that the new item that didn't fit is already on origpage.
    1522                 :      *
    1523                 :      * Since origpage does not actually contain newitem, the representation of
    1524                 :      * split points needs to work with two boundary cases: splits where
    1525                 :      * newitem is lastleft, and splits where newitem is firstright.
    1526                 :      * newitemonleft resolves the ambiguity that would otherwise exist when
    1527                 :      * newitemoff == firstrightoff.  In all other cases it's clear which side
    1528                 :      * of the split every tuple goes on from context.  newitemonleft is
    1529                 :      * usually (but not always) redundant information.
    1530                 :      *
    1531                 :      * firstrightoff is supposed to be an origpage offset number, but it's
    1532                 :      * possible that its value will be maxoff+1, which is "past the end" of
    1533                 :      * origpage.  This happens in the rare case where newitem goes after all
    1534                 :      * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
    1535                 :      * origpage at the point that leaves newitem alone on new right page.  Any
    1536                 :      * "!newitemonleft && newitemoff == firstrightoff" split point makes
    1537                 :      * newitem the firstright tuple, though, so this case isn't a special
    1538                 :      * case.
    1539                 :      */
    1540 GIC       23479 :     firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
    1541                 :                                      newitem, &newitemonleft);
    1542                 : 
    1543 ECB             :     /* Allocate temp buffer for leftpage */
    1544 GIC       23479 :     leftpage = PageGetTempPage(origpage);
    1545           23479 :     _bt_pageinit(leftpage, BufferGetPageSize(buf));
    1546           23479 :     lopaque = BTPageGetOpaque(leftpage);
    1547 ECB             : 
    1548                 :     /*
    1549                 :      * leftpage won't be the root when we're done.  Also, clear the SPLIT_END
    1550                 :      * and HAS_GARBAGE flags.
    1551                 :      */
    1552 GIC       23479 :     lopaque->btpo_flags = oopaque->btpo_flags;
    1553           23479 :     lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
    1554                 :     /* set flag in leftpage indicating that rightpage has no downlink yet */
    1555 CBC       23479 :     lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
    1556           23479 :     lopaque->btpo_prev = oopaque->btpo_prev;
    1557                 :     /* handle btpo_next after rightpage buffer acquired */
    1558           23479 :     lopaque->btpo_level = oopaque->btpo_level;
    1559 ECB             :     /* handle btpo_cycleid after rightpage buffer acquired */
    1560                 : 
    1561                 :     /*
    1562                 :      * Copy the original page's LSN into leftpage, which will become the
    1563                 :      * updated version of the page.  We need this because XLogInsert will
    1564                 :      * examine the LSN and possibly dump it in a page image.
    1565                 :      */
    1566 GIC       23479 :     PageSetLSN(leftpage, PageGetLSN(origpage));
    1567                 : 
    1568                 :     /*
    1569 ECB             :      * Determine page offset number of existing overlapped-with-orignewitem
    1570                 :      * posting list when it is necessary to perform a posting list split in
    1571                 :      * passing.  Note that newitem was already changed by caller (newitem no
    1572                 :      * longer has the orignewitem TID).
    1573                 :      *
    1574                 :      * This page offset number (origpagepostingoff) will be used to pretend
    1575                 :      * that the posting split has already taken place, even though the
    1576                 :      * required modifications to origpage won't occur until we reach the
    1577                 :      * critical section.  The lastleft and firstright tuples of our page split
    1578                 :      * point should, in effect, come from an imaginary version of origpage
    1579                 :      * that has the nposting tuple instead of the original posting list tuple.
    1580                 :      *
    1581                 :      * Note: _bt_findsplitloc() should have compensated for coinciding posting
    1582                 :      * list splits in just the same way, at least in theory.  It doesn't
    1583                 :      * bother with that, though.  In practice it won't affect its choice of
    1584                 :      * split point.
    1585                 :      */
    1586 GIC       23479 :     origpagepostingoff = InvalidOffsetNumber;
    1587           23479 :     if (postingoff != 0)
    1588                 :     {
    1589 CBC          26 :         Assert(isleaf);
    1590              26 :         Assert(ItemPointerCompare(&orignewitem->t_tid,
    1591                 :                                   &newitem->t_tid) < 0);
    1592              26 :         Assert(BTreeTupleIsPosting(nposting));
    1593              26 :         origpagepostingoff = OffsetNumberPrev(newitemoff);
    1594                 :     }
    1595 ECB             : 
    1596                 :     /*
    1597                 :      * The high key for the new left page is a possibly-truncated copy of
    1598                 :      * firstright on the leaf level (it's "firstright itself" on internal
    1599                 :      * pages; see !isleaf comments below).  This may seem to be contrary to
    1600                 :      * Lehman & Yao's approach of using a copy of lastleft as the new high key
    1601                 :      * when splitting on the leaf level.  It isn't, though.
    1602                 :      *
    1603                 :      * Suffix truncation will leave the left page's high key fully equal to
    1604                 :      * lastleft when lastleft and firstright are equal prior to heap TID (that
    1605                 :      * is, the tiebreaker TID value comes from lastleft).  It isn't actually
    1606                 :      * necessary for a new leaf high key to be a copy of lastleft for the L&Y
    1607                 :      * "subtree" invariant to hold.  It's sufficient to make sure that the new
    1608                 :      * leaf high key is strictly less than firstright, and greater than or
    1609                 :      * equal to (not necessarily equal to) lastleft.  In other words, when
    1610                 :      * suffix truncation isn't possible during a leaf page split, we take
    1611                 :      * L&Y's exact approach to generating a new high key for the left page.
    1612                 :      * (Actually, that is slightly inaccurate.  We don't just use a copy of
    1613                 :      * lastleft.  A tuple with all the keys from firstright but the max heap
    1614                 :      * TID from lastleft is used, to avoid introducing a special case.)
    1615                 :      */
    1616 GIC       23479 :     if (!newitemonleft && newitemoff == firstrightoff)
    1617                 :     {
    1618                 :         /* incoming tuple becomes firstright */
    1619 CBC          17 :         itemsz = newitemsz;
    1620 GIC          17 :         firstright = newitem;
    1621                 :     }
    1622 ECB             :     else
    1623                 :     {
    1624                 :         /* existing item at firstrightoff becomes firstright */
    1625 GIC       23462 :         itemid = PageGetItemId(origpage, firstrightoff);
    1626           23462 :         itemsz = ItemIdGetLength(itemid);
    1627           23462 :         firstright = (IndexTuple) PageGetItem(origpage, itemid);
    1628 CBC       23462 :         if (firstrightoff == origpagepostingoff)
    1629 LBC           0 :             firstright = nposting;
    1630 ECB             :     }
    1631                 : 
    1632 GBC       23479 :     if (isleaf)
    1633                 :     {
    1634                 :         IndexTuple  lastleft;
    1635 ECB             : 
    1636                 :         /* Attempt suffix truncation for leaf page splits */
    1637 GIC       23378 :         if (newitemonleft && newitemoff == firstrightoff)
    1638                 :         {
    1639                 :             /* incoming tuple becomes lastleft */
    1640 CBC         163 :             lastleft = newitem;
    1641                 :         }
    1642                 :         else
    1643 ECB             :         {
    1644                 :             OffsetNumber lastleftoff;
    1645                 : 
    1646                 :             /* existing item before firstrightoff becomes lastleft */
    1647 GIC       23215 :             lastleftoff = OffsetNumberPrev(firstrightoff);
    1648           23215 :             Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
    1649           23215 :             itemid = PageGetItemId(origpage, lastleftoff);
    1650 CBC       23215 :             lastleft = (IndexTuple) PageGetItem(origpage, itemid);
    1651           23215 :             if (lastleftoff == origpagepostingoff)
    1652               3 :                 lastleft = nposting;
    1653 ECB             :         }
    1654                 : 
    1655 CBC       23378 :         lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
    1656 GIC       23378 :         itemsz = IndexTupleSize(lefthighkey);
    1657                 :     }
    1658 ECB             :     else
    1659                 :     {
    1660                 :         /*
    1661                 :          * Don't perform suffix truncation on a copy of firstright to make
    1662                 :          * left page high key for internal page splits.  Must use firstright
    1663                 :          * as new high key directly.
    1664                 :          *
    1665                 :          * Each distinct separator key value originates as a leaf level high
    1666                 :          * key; all other separator keys/pivot tuples are copied from one
    1667                 :          * level down.  A separator key in a grandparent page must be
    1668                 :          * identical to high key in rightmost parent page of the subtree to
    1669                 :          * its left, which must itself be identical to high key in rightmost
    1670                 :          * child page of that same subtree (this even applies to separator
    1671                 :          * from grandparent's high key).  There must always be an unbroken
    1672                 :          * "seam" of identical separator keys that guide index scans at every
    1673                 :          * level, starting from the grandparent.  That's why suffix truncation
    1674                 :          * is unsafe here.
    1675                 :          *
    1676                 :          * Internal page splits will truncate firstright into a "negative
    1677                 :          * infinity" data item when it gets inserted on the new right page
    1678                 :          * below, though.  This happens during the call to _bt_pgaddtup() for
    1679                 :          * the new first data item for right page.  Do not confuse this
    1680                 :          * mechanism with suffix truncation.  It is just a convenient way of
    1681                 :          * implementing page splits that split the internal page "inside"
    1682                 :          * firstright.  The lefthighkey separator key cannot appear a second
    1683                 :          * time in the right page (only firstright's downlink goes in right
    1684                 :          * page).
    1685                 :          */
    1686 GIC         101 :         lefthighkey = firstright;
    1687                 :     }
    1688                 : 
    1689 ECB             :     /*
    1690                 :      * Add new high key to leftpage
    1691                 :      */
    1692 GIC       23479 :     afterleftoff = P_HIKEY;
    1693                 : 
    1694           23479 :     Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
    1695 CBC       23479 :     Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
    1696                 :            IndexRelationGetNumberOfKeyAttributes(rel));
    1697           23479 :     Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
    1698           23479 :     if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
    1699                 :                     false) == InvalidOffsetNumber)
    1700 LBC           0 :         elog(ERROR, "failed to add high key to the left sibling"
    1701 ECB             :              " while splitting block %u of index \"%s\"",
    1702                 :              origpagenumber, RelationGetRelationName(rel));
    1703 GBC       23479 :     afterleftoff = OffsetNumberNext(afterleftoff);
    1704                 : 
    1705                 :     /*
    1706 ECB             :      * Acquire a new right page to split into, now that left page has a new
    1707                 :      * high key.  From here on, it's not okay to throw an error without
    1708                 :      * zeroing rightpage first.  This coding rule ensures that we won't
    1709                 :      * confuse future VACUUM operations, which might otherwise try to re-find
    1710                 :      * a downlink to a leftover junk page as the page undergoes deletion.
    1711                 :      *
    1712                 :      * It would be reasonable to start the critical section just after the new
    1713                 :      * rightpage buffer is acquired instead; that would allow us to avoid
    1714                 :      * leftover junk pages without bothering to zero rightpage.  We do it this
    1715                 :      * way because it avoids an unnecessary PANIC when either origpage or its
    1716                 :      * existing sibling page are corrupt.
    1717                 :      */
    1718 GNC       23479 :     rbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE);
    1719 GIC       23479 :     rightpage = BufferGetPage(rbuf);
    1720           23479 :     rightpagenumber = BufferGetBlockNumber(rbuf);
    1721 ECB             :     /* rightpage was initialized by _bt_getbuf */
    1722 CBC       23479 :     ropaque = BTPageGetOpaque(rightpage);
    1723 ECB             : 
    1724                 :     /*
    1725                 :      * Finish off remaining leftpage special area fields.  They cannot be set
    1726                 :      * before both origpage (leftpage) and rightpage buffers are acquired and
    1727                 :      * locked.
    1728                 :      *
    1729                 :      * btpo_cycleid is only used with leaf pages, though we set it here in all
    1730                 :      * cases just to be consistent.
    1731                 :      */
    1732 GIC       23479 :     lopaque->btpo_next = rightpagenumber;
    1733           23479 :     lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
    1734                 : 
    1735 ECB             :     /*
    1736                 :      * rightpage won't be the root when we're done.  Also, clear the SPLIT_END
    1737                 :      * and HAS_GARBAGE flags.
    1738                 :      */
    1739 GIC       23479 :     ropaque->btpo_flags = oopaque->btpo_flags;
    1740           23479 :     ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
    1741           23479 :     ropaque->btpo_prev = origpagenumber;
    1742 CBC       23479 :     ropaque->btpo_next = oopaque->btpo_next;
    1743           23479 :     ropaque->btpo_level = oopaque->btpo_level;
    1744           23479 :     ropaque->btpo_cycleid = lopaque->btpo_cycleid;
    1745 ECB             : 
    1746                 :     /*
    1747                 :      * Add new high key to rightpage where necessary.
    1748                 :      *
    1749                 :      * If the page we're splitting is not the rightmost page at its level in
    1750                 :      * the tree, then the first entry on the page is the high key from
    1751                 :      * origpage.
    1752                 :      */
    1753 GIC       23479 :     afterrightoff = P_HIKEY;
    1754                 : 
    1755           23479 :     if (!isrightmost)
    1756 ECB             :     {
    1757                 :         IndexTuple  righthighkey;
    1758                 : 
    1759 GIC       11299 :         itemid = PageGetItemId(origpage, P_HIKEY);
    1760           11299 :         itemsz = ItemIdGetLength(itemid);
    1761           11299 :         righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
    1762 CBC       11299 :         Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
    1763           11299 :         Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
    1764 ECB             :                IndexRelationGetNumberOfKeyAttributes(rel));
    1765 CBC       11299 :         if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
    1766 ECB             :                         false, false) == InvalidOffsetNumber)
    1767                 :         {
    1768 LBC           0 :             memset(rightpage, 0, BufferGetPageSize(rbuf));
    1769 UIC           0 :             elog(ERROR, "failed to add high key to the right sibling"
    1770                 :                  " while splitting block %u of index \"%s\"",
    1771 EUB             :                  origpagenumber, RelationGetRelationName(rel));
    1772                 :         }
    1773 GIC       11299 :         afterrightoff = OffsetNumberNext(afterrightoff);
    1774                 :     }
    1775                 : 
    1776 ECB             :     /*
    1777                 :      * Internal page splits truncate first data item on right page -- it
    1778                 :      * becomes "minus infinity" item for the page.  Set this up here.
    1779                 :      */
    1780 GIC       23479 :     minusinfoff = InvalidOffsetNumber;
    1781           23479 :     if (!isleaf)
    1782             101 :         minusinfoff = afterrightoff;
    1783 ECB             : 
    1784                 :     /*
    1785                 :      * Now transfer all the data items (non-pivot tuples in isleaf case, or
    1786                 :      * additional pivot tuples in !isleaf case) to the appropriate page.
    1787                 :      *
    1788                 :      * Note: we *must* insert at least the right page's items in item-number
    1789                 :      * order, for the benefit of _bt_restore_page().
    1790                 :      */
    1791 GIC     6963551 :     for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
    1792                 :     {
    1793                 :         IndexTuple  dataitem;
    1794 ECB             : 
    1795 GIC     6940072 :         itemid = PageGetItemId(origpage, i);
    1796         6940072 :         itemsz = ItemIdGetLength(itemid);
    1797         6940072 :         dataitem = (IndexTuple) PageGetItem(origpage, itemid);
    1798 ECB             : 
    1799                 :         /* replace original item with nposting due to posting split? */
    1800 CBC     6940072 :         if (i == origpagepostingoff)
    1801                 :         {
    1802 GIC          26 :             Assert(BTreeTupleIsPosting(dataitem));
    1803 CBC          26 :             Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
    1804 GIC          26 :             dataitem = nposting;
    1805 ECB             :         }
    1806                 : 
    1807                 :         /* does new item belong before this one? */
    1808 GIC     6940046 :         else if (i == newitemoff)
    1809                 :         {
    1810           13783 :             if (newitemonleft)
    1811 ECB             :             {
    1812 GIC        4376 :                 Assert(newitemoff <= firstrightoff);
    1813 CBC        4376 :                 if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
    1814                 :                                   false))
    1815 ECB             :                 {
    1816 LBC           0 :                     memset(rightpage, 0, BufferGetPageSize(rbuf));
    1817 UIC           0 :                     elog(ERROR, "failed to add new item to the left sibling"
    1818                 :                          " while splitting block %u of index \"%s\"",
    1819 EUB             :                          origpagenumber, RelationGetRelationName(rel));
    1820                 :                 }
    1821 GIC        4376 :                 afterleftoff = OffsetNumberNext(afterleftoff);
    1822                 :             }
    1823                 :             else
    1824 ECB             :             {
    1825 GIC        9407 :                 Assert(newitemoff >= firstrightoff);
    1826            9407 :                 if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
    1827                 :                                   afterrightoff == minusinfoff))
    1828 ECB             :                 {
    1829 LBC           0 :                     memset(rightpage, 0, BufferGetPageSize(rbuf));
    1830 UIC           0 :                     elog(ERROR, "failed to add new item to the right sibling"
    1831                 :                          " while splitting block %u of index \"%s\"",
    1832 EUB             :                          origpagenumber, RelationGetRelationName(rel));
    1833                 :                 }
    1834 GIC        9407 :                 afterrightoff = OffsetNumberNext(afterrightoff);
    1835                 :             }
    1836                 :         }
    1837 ECB             : 
    1838                 :         /* decide which page to put it on */
    1839 GIC     6940072 :         if (i < firstrightoff)
    1840                 :         {
    1841         5140479 :             if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
    1842 ECB             :             {
    1843 UIC           0 :                 memset(rightpage, 0, BufferGetPageSize(rbuf));
    1844 LBC           0 :                 elog(ERROR, "failed to add old item to the left sibling"
    1845                 :                      " while splitting block %u of index \"%s\"",
    1846 EUB             :                      origpagenumber, RelationGetRelationName(rel));
    1847                 :             }
    1848 GIC     5140479 :             afterleftoff = OffsetNumberNext(afterleftoff);
    1849                 :         }
    1850                 :         else
    1851 ECB             :         {
    1852 GIC     1799593 :             if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
    1853                 :                               afterrightoff == minusinfoff))
    1854                 :             {
    1855 LBC           0 :                 memset(rightpage, 0, BufferGetPageSize(rbuf));
    1856 UIC           0 :                 elog(ERROR, "failed to add old item to the right sibling"
    1857                 :                      " while splitting block %u of index \"%s\"",
    1858 EUB             :                      origpagenumber, RelationGetRelationName(rel));
    1859                 :             }
    1860 GIC     1799593 :             afterrightoff = OffsetNumberNext(afterrightoff);
    1861                 :         }
    1862                 :     }
    1863 ECB             : 
    1864                 :     /* Handle case where newitem goes at the end of rightpage */
    1865 GIC       23479 :     if (i <= newitemoff)
    1866                 :     {
    1867                 :         /*
    1868 ECB             :          * Can't have newitemonleft here; that would imply we were told to put
    1869                 :          * *everything* on the left page, which cannot fit (if it could, we'd
    1870                 :          * not be splitting the page).
    1871                 :          */
    1872 GIC        9696 :         Assert(!newitemonleft && newitemoff == maxoff + 1);
    1873            9696 :         if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
    1874                 :                           afterrightoff == minusinfoff))
    1875 ECB             :         {
    1876 LBC           0 :             memset(rightpage, 0, BufferGetPageSize(rbuf));
    1877 UIC           0 :             elog(ERROR, "failed to add new item to the right sibling"
    1878                 :                  " while splitting block %u of index \"%s\"",
    1879 EUB             :                  origpagenumber, RelationGetRelationName(rel));
    1880                 :         }
    1881 GIC        9696 :         afterrightoff = OffsetNumberNext(afterrightoff);
    1882                 :     }
    1883                 : 
    1884 ECB             :     /*
    1885                 :      * We have to grab the original right sibling (if any) and update its prev
    1886                 :      * link.  We are guaranteed that this is deadlock-free, since we couple
    1887                 :      * the locks in the standard order: left to right.
    1888                 :      */
    1889 GIC       23479 :     if (!isrightmost)
    1890                 :     {
    1891 GNC       11299 :         sbuf = _bt_getbuf(rel, heaprel, oopaque->btpo_next, BT_WRITE);
    1892 CBC       11299 :         spage = BufferGetPage(sbuf);
    1893 GIC       11299 :         sopaque = BTPageGetOpaque(spage);
    1894 CBC       11299 :         if (sopaque->btpo_prev != origpagenumber)
    1895 ECB             :         {
    1896 LBC           0 :             memset(rightpage, 0, BufferGetPageSize(rbuf));
    1897               0 :             ereport(ERROR,
    1898                 :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    1899 EUB             :                      errmsg_internal("right sibling's left-link doesn't match: "
    1900                 :                                      "block %u links to %u instead of expected %u in index \"%s\"",
    1901                 :                                      oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
    1902                 :                                      RelationGetRelationName(rel))));
    1903                 :         }
    1904                 : 
    1905                 :         /*
    1906                 :          * Check to see if we can set the SPLIT_END flag in the right-hand
    1907                 :          * split page; this can save some I/O for vacuum since it need not
    1908                 :          * proceed to the right sibling.  We can set the flag if the right
    1909                 :          * sibling has a different cycleid: that means it could not be part of
    1910                 :          * a group of pages that were all split off from the same ancestor
    1911                 :          * page.  If you're confused, imagine that page A splits to A B and
    1912                 :          * then again, yielding A C B, while vacuum is in progress.  Tuples
    1913                 :          * originally in A could now be in either B or C, hence vacuum must
    1914                 :          * examine both pages.  But if D, our right sibling, has a different
    1915                 :          * cycleid then it could not contain any tuples that were in A when
    1916                 :          * the vacuum started.
    1917                 :          */
    1918 GIC       11299 :         if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
    1919 UIC           0 :             ropaque->btpo_flags |= BTP_SPLIT_END;
    1920                 :     }
    1921 ECB             : 
    1922 EUB             :     /*
    1923                 :      * Right sibling is locked, new siblings are prepared, but original page
    1924                 :      * is not updated yet.
    1925                 :      *
    1926                 :      * NO EREPORT(ERROR) till right sibling is updated.  We can get away with
    1927                 :      * not starting the critical section till here because we haven't been
    1928                 :      * scribbling on the original page yet; see comments above.
    1929                 :      */
    1930 GIC       23479 :     START_CRIT_SECTION();
    1931                 : 
    1932                 :     /*
    1933 ECB             :      * By here, the original data page has been split into two new halves, and
    1934                 :      * these are correct.  The algorithm requires that the left page never
    1935                 :      * move during a split, so we copy the new left page back on top of the
    1936                 :      * original.  We need to do this before writing the WAL record, so that
    1937                 :      * XLogInsert can WAL log an image of the page if necessary.
    1938                 :      */
    1939 GIC       23479 :     PageRestoreTempPage(leftpage, origpage);
    1940                 :     /* leftpage, lopaque must not be used below here */
    1941                 : 
    1942 CBC       23479 :     MarkBufferDirty(buf);
    1943 GIC       23479 :     MarkBufferDirty(rbuf);
    1944                 : 
    1945 CBC       23479 :     if (!isrightmost)
    1946 ECB             :     {
    1947 GIC       11299 :         sopaque->btpo_prev = rightpagenumber;
    1948 CBC       11299 :         MarkBufferDirty(sbuf);
    1949                 :     }
    1950 ECB             : 
    1951                 :     /*
    1952                 :      * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
    1953                 :      * a split
    1954                 :      */
    1955 GIC       23479 :     if (!isleaf)
    1956                 :     {
    1957             101 :         Page        cpage = BufferGetPage(cbuf);
    1958 CBC         101 :         BTPageOpaque cpageop = BTPageGetOpaque(cpage);
    1959                 : 
    1960             101 :         cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    1961             101 :         MarkBufferDirty(cbuf);
    1962                 :     }
    1963 ECB             : 
    1964                 :     /* XLOG stuff */
    1965 GIC       23479 :     if (RelationNeedsWAL(rel))
    1966                 :     {
    1967                 :         xl_btree_split xlrec;
    1968 ECB             :         uint8       xlinfo;
    1969                 :         XLogRecPtr  recptr;
    1970                 : 
    1971 GIC       22648 :         xlrec.level = ropaque->btpo_level;
    1972                 :         /* See comments below on newitem, orignewitem, and posting lists */
    1973           22648 :         xlrec.firstrightoff = firstrightoff;
    1974 CBC       22648 :         xlrec.newitemoff = newitemoff;
    1975 GIC       22648 :         xlrec.postingoff = 0;
    1976 CBC       22648 :         if (postingoff != 0 && origpagepostingoff < firstrightoff)
    1977              17 :             xlrec.postingoff = postingoff;
    1978 ECB             : 
    1979 CBC       22648 :         XLogBeginInsert();
    1980           22648 :         XLogRegisterData((char *) &xlrec, SizeOfBtreeSplit);
    1981                 : 
    1982           22648 :         XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
    1983           22648 :         XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
    1984                 :         /* Log original right sibling, since we've changed its prev-pointer */
    1985           22648 :         if (!isrightmost)
    1986           11293 :             XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
    1987 GIC       22648 :         if (!isleaf)
    1988 CBC         101 :             XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
    1989 ECB             : 
    1990                 :         /*
    1991                 :          * Log the new item, if it was inserted on the left page. (If it was
    1992                 :          * put on the right page, we don't need to explicitly WAL log it
    1993                 :          * because it's included with all the other items on the right page.)
    1994                 :          * Show the new item as belonging to the left page buffer, so that it
    1995                 :          * is not stored if XLogInsert decides it needs a full-page image of
    1996                 :          * the left page.  We always store newitemoff in the record, though.
    1997                 :          *
    1998                 :          * The details are sometimes slightly different for page splits that
    1999                 :          * coincide with a posting list split.  If both the replacement
    2000                 :          * posting list and newitem go on the right page, then we don't need
    2001                 :          * to log anything extra, just like the simple !newitemonleft
    2002                 :          * no-posting-split case (postingoff is set to zero in the WAL record,
    2003                 :          * so recovery doesn't need to process a posting list split at all).
    2004                 :          * Otherwise, we set postingoff and log orignewitem instead of
    2005                 :          * newitem, despite having actually inserted newitem.  REDO routine
    2006                 :          * must reconstruct nposting and newitem using _bt_swap_posting().
    2007                 :          *
    2008                 :          * Note: It's possible that our page split point is the point that
    2009                 :          * makes the posting list lastleft and newitem firstright.  This is
    2010                 :          * the only case where we log orignewitem/newitem despite newitem
    2011                 :          * going on the right page.  If XLogInsert decides that it can omit
    2012                 :          * orignewitem due to logging a full-page image of the left page,
    2013                 :          * everything still works out, since recovery only needs to log
    2014                 :          * orignewitem for items on the left page (just like the regular
    2015                 :          * newitem-logged case).
    2016                 :          */
    2017 GIC       22648 :         if (newitemonleft && xlrec.postingoff == 0)
    2018            4359 :             XLogRegisterBufData(0, (char *) newitem, newitemsz);
    2019           18289 :         else if (xlrec.postingoff != 0)
    2020 ECB             :         {
    2021 CBC          17 :             Assert(isleaf);
    2022              17 :             Assert(newitemonleft || firstrightoff == newitemoff);
    2023 GIC          17 :             Assert(newitemsz == IndexTupleSize(orignewitem));
    2024 CBC          17 :             XLogRegisterBufData(0, (char *) orignewitem, newitemsz);
    2025 ECB             :         }
    2026                 : 
    2027                 :         /* Log the left page's new high key */
    2028 GIC       22648 :         if (!isleaf)
    2029                 :         {
    2030                 :             /* lefthighkey isn't local copy, get current pointer */
    2031 CBC         101 :             itemid = PageGetItemId(origpage, P_HIKEY);
    2032 GIC         101 :             lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
    2033                 :         }
    2034 CBC       22648 :         XLogRegisterBufData(0, (char *) lefthighkey,
    2035           22648 :                             MAXALIGN(IndexTupleSize(lefthighkey)));
    2036                 : 
    2037 ECB             :         /*
    2038                 :          * Log the contents of the right page in the format understood by
    2039                 :          * _bt_restore_page().  The whole right page will be recreated.
    2040                 :          *
    2041                 :          * Direct access to page is not good but faster - we should implement
    2042                 :          * some new func in page API.  Note we only store the tuples
    2043                 :          * themselves, knowing that they were inserted in item-number order
    2044                 :          * and so the line pointers can be reconstructed.  See comments for
    2045                 :          * _bt_restore_page().
    2046                 :          */
    2047 GIC       22648 :         XLogRegisterBufData(1,
    2048           22648 :                             (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
    2049           22648 :                             ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
    2050 ECB             : 
    2051 CBC       22648 :         xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
    2052           22648 :         recptr = XLogInsert(RM_BTREE_ID, xlinfo);
    2053                 : 
    2054           22648 :         PageSetLSN(origpage, recptr);
    2055           22648 :         PageSetLSN(rightpage, recptr);
    2056 GIC       22648 :         if (!isrightmost)
    2057 CBC       11293 :             PageSetLSN(spage, recptr);
    2058           22648 :         if (!isleaf)
    2059             101 :             PageSetLSN(BufferGetPage(cbuf), recptr);
    2060 ECB             :     }
    2061                 : 
    2062 CBC       23479 :     END_CRIT_SECTION();
    2063                 : 
    2064                 :     /* release the old right sibling */
    2065           23479 :     if (!isrightmost)
    2066 GIC       11299 :         _bt_relbuf(rel, sbuf);
    2067                 : 
    2068 ECB             :     /* release the child */
    2069 CBC       23479 :     if (!isleaf)
    2070 GIC         101 :         _bt_relbuf(rel, cbuf);
    2071                 : 
    2072 ECB             :     /* be tidy */
    2073 CBC       23479 :     if (isleaf)
    2074 GIC       23378 :         pfree(lefthighkey);
    2075                 : 
    2076 ECB             :     /* split's done */
    2077 CBC       23479 :     return rbuf;
    2078                 : }
    2079                 : 
    2080 ECB             : /*
    2081                 :  * _bt_insert_parent() -- Insert downlink into parent, completing split.
    2082                 :  *
    2083                 :  * On entry, buf and rbuf are the left and right split pages, which we
    2084                 :  * still hold write locks on.  Both locks will be released here.  We
    2085                 :  * release the rbuf lock once we have a write lock on the page that we
    2086                 :  * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
    2087                 :  * The lock on buf is released at the same point as the lock on the parent
    2088                 :  * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
    2089                 :  * atomic operation that completes the split by inserting a new downlink.
    2090                 :  *
    2091                 :  * stack - stack showing how we got here.  Will be NULL when splitting true
    2092                 :  *          root, or during concurrent root split, where we can be inefficient
    2093                 :  * isroot - we split the true root
    2094                 :  * isonly - we split a page alone on its level (might have been fast root)
    2095                 :  */
    2096                 : static void
    2097 GIC       23479 : _bt_insert_parent(Relation rel,
    2098                 :                   Relation heaprel,
    2099                 :                   Buffer buf,
    2100                 :                   Buffer rbuf,
    2101 ECB             :                   BTStack stack,
    2102                 :                   bool isroot,
    2103                 :                   bool isonly)
    2104                 : {
    2105                 :     /*
    2106                 :      * Here we have to do something Lehman and Yao don't talk about: deal with
    2107                 :      * a root split and construction of a new root.  If our stack is empty
    2108                 :      * then we have just split a node on what had been the root level when we
    2109                 :      * descended the tree.  If it was still the root then we perform a
    2110                 :      * new-root construction.  If it *wasn't* the root anymore, search to find
    2111                 :      * the next higher level that someone constructed meanwhile, and find the
    2112                 :      * right place to insert as for the normal case.
    2113                 :      *
    2114                 :      * If we have to search for the parent level, we do so by re-descending
    2115                 :      * from the root.  This is not super-efficient, but it's rare enough not
    2116                 :      * to matter.
    2117                 :      */
    2118 GIC       23479 :     if (isroot)
    2119                 :     {
    2120                 :         Buffer      rootbuf;
    2121                 : 
    2122 CBC        2618 :         Assert(stack == NULL);
    2123 GIC        2618 :         Assert(isonly);
    2124                 :         /* create a new root node and update the metapage */
    2125 GNC        2618 :         rootbuf = _bt_newroot(rel, heaprel, buf, rbuf);
    2126 ECB             :         /* release the split buffers */
    2127 CBC        2618 :         _bt_relbuf(rel, rootbuf);
    2128 GIC        2618 :         _bt_relbuf(rel, rbuf);
    2129 CBC        2618 :         _bt_relbuf(rel, buf);
    2130                 :     }
    2131 ECB             :     else
    2132                 :     {
    2133 CBC       20861 :         BlockNumber bknum = BufferGetBlockNumber(buf);
    2134 GIC       20861 :         BlockNumber rbknum = BufferGetBlockNumber(rbuf);
    2135           20861 :         Page        page = BufferGetPage(buf);
    2136                 :         IndexTuple  new_item;
    2137 ECB             :         BTStackData fakestack;
    2138                 :         IndexTuple  ritem;
    2139                 :         Buffer      pbuf;
    2140                 : 
    2141 GIC       20861 :         if (stack == NULL)
    2142                 :         {
    2143                 :             BTPageOpaque opaque;
    2144                 : 
    2145 CBC          12 :             elog(DEBUG2, "concurrent ROOT page split");
    2146 GIC          12 :             opaque = BTPageGetOpaque(page);
    2147                 : 
    2148                 :             /*
    2149 ECB             :              * We should never reach here when a leaf page split takes place
    2150                 :              * despite the insert of newitem being able to apply the fastpath
    2151                 :              * optimization.  Make sure of that with an assertion.
    2152                 :              *
    2153                 :              * This is more of a performance issue than a correctness issue.
    2154                 :              * The fastpath won't have a descent stack.  Using a phony stack
    2155                 :              * here works, but never rely on that.  The fastpath should be
    2156                 :              * rejected within _bt_search_insert() when the rightmost leaf
    2157                 :              * page will split, since it's faster to go through _bt_search()
    2158                 :              * and get a stack in the usual way.
    2159                 :              */
    2160 GIC          12 :             Assert(!(P_ISLEAF(opaque) &&
    2161                 :                      BlockNumberIsValid(RelationGetTargetBlock(rel))));
    2162                 : 
    2163                 :             /* Find the leftmost page at the next level up */
    2164 GNC          12 :             pbuf = _bt_get_endpoint(rel, heaprel, opaque->btpo_level + 1, false,
    2165                 :                                     NULL);
    2166                 :             /* Set up a phony stack entry pointing there */
    2167 GIC          12 :             stack = &fakestack;
    2168              12 :             stack->bts_blkno = BufferGetBlockNumber(pbuf);
    2169 CBC          12 :             stack->bts_offset = InvalidOffsetNumber;
    2170 GIC          12 :             stack->bts_parent = NULL;
    2171              12 :             _bt_relbuf(rel, pbuf);
    2172 ECB             :         }
    2173                 : 
    2174                 :         /* get high key from left, a strict lower bound for new right page */
    2175 CBC       20861 :         ritem = (IndexTuple) PageGetItem(page,
    2176 ECB             :                                          PageGetItemId(page, P_HIKEY));
    2177                 : 
    2178                 :         /* form an index tuple that points at the new right page */
    2179 GIC       20861 :         new_item = CopyIndexTuple(ritem);
    2180 CBC       20861 :         BTreeTupleSetDownLink(new_item, rbknum);
    2181                 : 
    2182                 :         /*
    2183                 :          * Re-find and write lock the parent of buf.
    2184 ECB             :          *
    2185                 :          * It's possible that the location of buf's downlink has changed since
    2186                 :          * our initial _bt_search() descent.  _bt_getstackbuf() will detect
    2187                 :          * and recover from this, updating the stack, which ensures that the
    2188                 :          * new downlink will be inserted at the correct offset. Even buf's
    2189                 :          * parent may have changed.
    2190                 :          */
    2191 GNC       20861 :         pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
    2192                 : 
    2193                 :         /*
    2194                 :          * Unlock the right child.  The left child will be unlocked in
    2195                 :          * _bt_insertonpg().
    2196 ECB             :          *
    2197                 :          * Unlocking the right child must be delayed until here to ensure that
    2198                 :          * no concurrent VACUUM operation can become confused.  Page deletion
    2199                 :          * cannot be allowed to fail to re-find a downlink for the rbuf page.
    2200                 :          * (Actually, this is just a vestige of how things used to work.  The
    2201                 :          * page deletion code is expected to check for the INCOMPLETE_SPLIT
    2202                 :          * flag on the left child.  It won't attempt deletion of the right
    2203                 :          * child until the split is complete.  Despite all this, we opt to
    2204                 :          * conservatively delay unlocking the right child until here.)
    2205                 :          */
    2206 GIC       20861 :         _bt_relbuf(rel, rbuf);
    2207                 : 
    2208           20861 :         if (pbuf == InvalidBuffer)
    2209 UIC           0 :             ereport(ERROR,
    2210                 :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2211 ECB             :                      errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
    2212                 :                                      RelationGetRelationName(rel), bknum, rbknum)));
    2213                 : 
    2214 EUB             :         /* Recursively insert into the parent */
    2215 GNC       20861 :         _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
    2216 GIC       20861 :                        new_item, MAXALIGN(IndexTupleSize(new_item)),
    2217           20861 :                        stack->bts_offset + 1, 0, isonly);
    2218                 : 
    2219                 :         /* be tidy */
    2220 CBC       20861 :         pfree(new_item);
    2221 ECB             :     }
    2222 CBC       23479 : }
    2223                 : 
    2224                 : /*
    2225 ECB             :  * _bt_finish_split() -- Finish an incomplete split
    2226                 :  *
    2227                 :  * A crash or other failure can leave a split incomplete.  The insertion
    2228                 :  * routines won't allow to insert on a page that is incompletely split.
    2229                 :  * Before inserting on such a page, call _bt_finish_split().
    2230                 :  *
    2231                 :  * On entry, 'lbuf' must be locked in write-mode.  On exit, it is unlocked
    2232                 :  * and unpinned.
    2233                 :  */
    2234                 : void
    2235 UNC           0 : _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
    2236                 : {
    2237 UIC           0 :     Page        lpage = BufferGetPage(lbuf);
    2238               0 :     BTPageOpaque lpageop = BTPageGetOpaque(lpage);
    2239                 :     Buffer      rbuf;
    2240 EUB             :     Page        rpage;
    2241                 :     BTPageOpaque rpageop;
    2242                 :     bool        wasroot;
    2243                 :     bool        wasonly;
    2244                 : 
    2245 UIC           0 :     Assert(P_INCOMPLETE_SPLIT(lpageop));
    2246                 : 
    2247                 :     /* Lock right sibling, the one missing the downlink */
    2248 UNC           0 :     rbuf = _bt_getbuf(rel, heaprel, lpageop->btpo_next, BT_WRITE);
    2249 UIC           0 :     rpage = BufferGetPage(rbuf);
    2250 UBC           0 :     rpageop = BTPageGetOpaque(rpage);
    2251                 : 
    2252                 :     /* Could this be a root split? */
    2253               0 :     if (!stack)
    2254 EUB             :     {
    2255                 :         Buffer      metabuf;
    2256                 :         Page        metapg;
    2257                 :         BTMetaPageData *metad;
    2258                 : 
    2259                 :         /* acquire lock on the metapage */
    2260 UNC           0 :         metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE);
    2261 UIC           0 :         metapg = BufferGetPage(metabuf);
    2262               0 :         metad = BTPageGetMeta(metapg);
    2263                 : 
    2264               0 :         wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
    2265 EUB             : 
    2266 UBC           0 :         _bt_relbuf(rel, metabuf);
    2267 EUB             :     }
    2268                 :     else
    2269 UBC           0 :         wasroot = false;
    2270                 : 
    2271 EUB             :     /* Was this the only page on the level before split? */
    2272 UIC           0 :     wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
    2273                 : 
    2274 UBC           0 :     elog(DEBUG1, "finishing incomplete split of %u/%u",
    2275                 :          BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
    2276                 : 
    2277 UNC           0 :     _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
    2278 UIC           0 : }
    2279 EUB             : 
    2280                 : /*
    2281                 :  *  _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
    2282                 :  *                       tuple whose downlink points to child page.
    2283                 :  *
    2284                 :  *      Caller passes child's block number, which is used to identify
    2285                 :  *      associated pivot tuple in parent page using a linear search that
    2286                 :  *      matches on pivot's downlink/block number.  The expected location of
    2287                 :  *      the pivot tuple is taken from the stack one level above the child
    2288                 :  *      page.  This is used as a starting point.  Insertions into the
    2289                 :  *      parent level could cause the pivot tuple to move right; deletions
    2290                 :  *      could cause it to move left, but not left of the page we previously
    2291                 :  *      found it on.
    2292                 :  *
    2293                 :  *      Caller can use its stack to relocate the pivot tuple/downlink for
    2294                 :  *      any same-level page to the right of the page found by its initial
    2295                 :  *      descent.  This is necessary because of the possibility that caller
    2296                 :  *      moved right to recover from a concurrent page split.  It's also
    2297                 :  *      convenient for certain callers to be able to step right when there
    2298                 :  *      wasn't a concurrent page split, while still using their original
    2299                 :  *      stack.  For example, the checkingunique _bt_doinsert() case may
    2300                 :  *      have to step right when there are many physical duplicates, and its
    2301                 :  *      scantid forces an insertion to the right of the "first page the
    2302                 :  *      value could be on".  (This is also relied on by all of our callers
    2303                 :  *      when dealing with !heapkeyspace indexes.)
    2304                 :  *
    2305                 :  *      Returns write-locked parent page buffer, or InvalidBuffer if pivot
    2306                 :  *      tuple not found (should not happen).  Adjusts bts_blkno &
    2307                 :  *      bts_offset if changed.  Page split caller should insert its new
    2308                 :  *      pivot tuple for its new right sibling page on parent page, at the
    2309                 :  *      offset number bts_offset + 1.
    2310                 :  */
    2311                 : Buffer
    2312 GNC       23619 : _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
    2313                 : {
    2314                 :     BlockNumber blkno;
    2315                 :     OffsetNumber start;
    2316                 : 
    2317 CBC       23619 :     blkno = stack->bts_blkno;
    2318 GIC       23619 :     start = stack->bts_offset;
    2319                 : 
    2320                 :     for (;;)
    2321               5 :     {
    2322 ECB             :         Buffer      buf;
    2323                 :         Page        page;
    2324                 :         BTPageOpaque opaque;
    2325                 : 
    2326 GNC       23624 :         buf = _bt_getbuf(rel, heaprel, blkno, BT_WRITE);
    2327 GIC       23624 :         page = BufferGetPage(buf);
    2328           23624 :         opaque = BTPageGetOpaque(page);
    2329                 : 
    2330           23624 :         if (P_INCOMPLETE_SPLIT(opaque))
    2331 ECB             :         {
    2332 UNC           0 :             _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
    2333 LBC           0 :             continue;
    2334                 :         }
    2335 ECB             : 
    2336 GIC       23624 :         if (!P_IGNORE(opaque))
    2337 EUB             :         {
    2338                 :             OffsetNumber offnum,
    2339                 :                         minoff,
    2340                 :                         maxoff;
    2341 ECB             :             ItemId      itemid;
    2342                 :             IndexTuple  item;
    2343                 : 
    2344 GIC       23622 :             minoff = P_FIRSTDATAKEY(opaque);
    2345           23622 :             maxoff = PageGetMaxOffsetNumber(page);
    2346                 : 
    2347                 :             /*
    2348                 :              * start = InvalidOffsetNumber means "search the whole page". We
    2349 ECB             :              * need this test anyway due to possibility that page has a high
    2350                 :              * key now when it didn't before.
    2351                 :              */
    2352 GIC       23622 :             if (start < minoff)
    2353              17 :                 start = minoff;
    2354                 : 
    2355                 :             /*
    2356                 :              * Need this check too, to guard against possibility that page
    2357 ECB             :              * split since we visited it originally.
    2358                 :              */
    2359 GIC       23622 :             if (start > maxoff)
    2360               3 :                 start = OffsetNumberNext(maxoff);
    2361                 : 
    2362                 :             /*
    2363                 :              * These loops will check every item on the page --- but in an
    2364 ECB             :              * order that's attuned to the probability of where it actually
    2365                 :              * is.  Scan to the right first, then to the left.
    2366                 :              */
    2367 GIC       23622 :             for (offnum = start;
    2368           24048 :                  offnum <= maxoff;
    2369             426 :                  offnum = OffsetNumberNext(offnum))
    2370                 :             {
    2371           24045 :                 itemid = PageGetItemId(page, offnum);
    2372 CBC       24045 :                 item = (IndexTuple) PageGetItem(page, itemid);
    2373 ECB             : 
    2374 CBC       24045 :                 if (BTreeTupleGetDownLink(item) == child)
    2375                 :                 {
    2376 ECB             :                     /* Return accurate pointer to where link is now */
    2377 CBC       23619 :                     stack->bts_blkno = blkno;
    2378 GIC       23619 :                     stack->bts_offset = offnum;
    2379 CBC       23619 :                     return buf;
    2380                 :                 }
    2381                 :             }
    2382 ECB             : 
    2383 CBC           3 :             for (offnum = OffsetNumberPrev(start);
    2384             858 :                  offnum >= minoff;
    2385 GIC         855 :                  offnum = OffsetNumberPrev(offnum))
    2386                 :             {
    2387             855 :                 itemid = PageGetItemId(page, offnum);
    2388 CBC         855 :                 item = (IndexTuple) PageGetItem(page, itemid);
    2389 ECB             : 
    2390 CBC         855 :                 if (BTreeTupleGetDownLink(item) == child)
    2391                 :                 {
    2392 ECB             :                     /* Return accurate pointer to where link is now */
    2393 LBC           0 :                     stack->bts_blkno = blkno;
    2394 UIC           0 :                     stack->bts_offset = offnum;
    2395 LBC           0 :                     return buf;
    2396                 :                 }
    2397                 :             }
    2398 EUB             :         }
    2399                 : 
    2400                 :         /*
    2401                 :          * The item we're looking for moved right at least one page.
    2402                 :          *
    2403                 :          * Lehman and Yao couple/chain locks when moving right here, which we
    2404                 :          * can avoid.  See nbtree/README.
    2405                 :          */
    2406 GIC           5 :         if (P_RIGHTMOST(opaque))
    2407                 :         {
    2408 UIC           0 :             _bt_relbuf(rel, buf);
    2409               0 :             return InvalidBuffer;
    2410                 :         }
    2411 CBC           5 :         blkno = opaque->btpo_next;
    2412 GIC           5 :         start = InvalidOffsetNumber;
    2413 GBC           5 :         _bt_relbuf(rel, buf);
    2414 EUB             :     }
    2415                 : }
    2416 ECB             : 
    2417                 : /*
    2418                 :  *  _bt_newroot() -- Create a new root page for the index.
    2419                 :  *
    2420                 :  *      We've just split the old root page and need to create a new one.
    2421                 :  *      In order to do this, we add a new root page to the file, then lock
    2422                 :  *      the metadata page and update it.  This is guaranteed to be deadlock-
    2423                 :  *      free, because all readers release their locks on the metadata page
    2424                 :  *      before trying to lock the root, and all writers lock the root before
    2425                 :  *      trying to lock the metadata page.  We have a write lock on the old
    2426                 :  *      root page, so we have not introduced any cycles into the waits-for
    2427                 :  *      graph.
    2428                 :  *
    2429                 :  *      On entry, lbuf (the old root) and rbuf (its new peer) are write-
    2430                 :  *      locked. On exit, a new root page exists with entries for the
    2431                 :  *      two new children, metapage is updated and unlocked/unpinned.
    2432                 :  *      The new root buffer is returned to caller which has to unlock/unpin
    2433                 :  *      lbuf, rbuf & rootbuf.
    2434                 :  */
    2435                 : static Buffer
    2436 GNC        2618 : _bt_newroot(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
    2437                 : {
    2438                 :     Buffer      rootbuf;
    2439                 :     Page        lpage,
    2440                 :                 rootpage;
    2441 ECB             :     BlockNumber lbkno,
    2442                 :                 rbkno;
    2443                 :     BlockNumber rootblknum;
    2444                 :     BTPageOpaque rootopaque;
    2445                 :     BTPageOpaque lopaque;
    2446                 :     ItemId      itemid;
    2447                 :     IndexTuple  item;
    2448                 :     IndexTuple  left_item;
    2449                 :     Size        left_item_sz;
    2450                 :     IndexTuple  right_item;
    2451                 :     Size        right_item_sz;
    2452                 :     Buffer      metabuf;
    2453                 :     Page        metapg;
    2454                 :     BTMetaPageData *metad;
    2455                 : 
    2456 GIC        2618 :     lbkno = BufferGetBlockNumber(lbuf);
    2457            2618 :     rbkno = BufferGetBlockNumber(rbuf);
    2458            2618 :     lpage = BufferGetPage(lbuf);
    2459            2618 :     lopaque = BTPageGetOpaque(lpage);
    2460                 : 
    2461 ECB             :     /* get a new root page */
    2462 GNC        2618 :     rootbuf = _bt_getbuf(rel, heaprel, P_NEW, BT_WRITE);
    2463 CBC        2618 :     rootpage = BufferGetPage(rootbuf);
    2464            2618 :     rootblknum = BufferGetBlockNumber(rootbuf);
    2465                 : 
    2466                 :     /* acquire lock on the metapage */
    2467 GNC        2618 :     metabuf = _bt_getbuf(rel, heaprel, BTREE_METAPAGE, BT_WRITE);
    2468 CBC        2618 :     metapg = BufferGetPage(metabuf);
    2469            2618 :     metad = BTPageGetMeta(metapg);
    2470                 : 
    2471                 :     /*
    2472 ECB             :      * Create downlink item for left page (old root).  The key value used is
    2473                 :      * "minus infinity", a sentinel value that's reliably less than any real
    2474                 :      * key value that could appear in the left page.
    2475                 :      */
    2476 GIC        2618 :     left_item_sz = sizeof(IndexTupleData);
    2477            2618 :     left_item = (IndexTuple) palloc(left_item_sz);
    2478            2618 :     left_item->t_info = left_item_sz;
    2479            2618 :     BTreeTupleSetDownLink(left_item, lbkno);
    2480            2618 :     BTreeTupleSetNAtts(left_item, 0, false);
    2481 ECB             : 
    2482                 :     /*
    2483                 :      * Create downlink item for right page.  The key for it is obtained from
    2484                 :      * the "high key" position in the left page.
    2485                 :      */
    2486 GIC        2618 :     itemid = PageGetItemId(lpage, P_HIKEY);
    2487            2618 :     right_item_sz = ItemIdGetLength(itemid);
    2488            2618 :     item = (IndexTuple) PageGetItem(lpage, itemid);
    2489            2618 :     right_item = CopyIndexTuple(item);
    2490            2618 :     BTreeTupleSetDownLink(right_item, rbkno);
    2491 ECB             : 
    2492                 :     /* NO EREPORT(ERROR) from here till newroot op is logged */
    2493 CBC        2618 :     START_CRIT_SECTION();
    2494 ECB             : 
    2495                 :     /* upgrade metapage if needed */
    2496 GIC        2618 :     if (metad->btm_version < BTREE_NOVAC_VERSION)
    2497 UIC           0 :         _bt_upgrademetapage(metapg);
    2498 ECB             : 
    2499                 :     /* set btree special data */
    2500 GIC        2618 :     rootopaque = BTPageGetOpaque(rootpage);
    2501 CBC        2618 :     rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
    2502 GBC        2618 :     rootopaque->btpo_flags = BTP_ROOT;
    2503 GIC        2618 :     rootopaque->btpo_level =
    2504            2618 :         (BTPageGetOpaque(lpage))->btpo_level + 1;
    2505 CBC        2618 :     rootopaque->btpo_cycleid = 0;
    2506 ECB             : 
    2507                 :     /* update metapage data */
    2508 CBC        2618 :     metad->btm_root = rootblknum;
    2509            2618 :     metad->btm_level = rootopaque->btpo_level;
    2510            2618 :     metad->btm_fastroot = rootblknum;
    2511 GIC        2618 :     metad->btm_fastlevel = rootopaque->btpo_level;
    2512                 : 
    2513 ECB             :     /*
    2514                 :      * Insert the left page pointer into the new root page.  The root page is
    2515                 :      * the rightmost page on its level so there is no "high key" in it; the
    2516                 :      * two items will go into positions P_HIKEY and P_FIRSTKEY.
    2517                 :      *
    2518                 :      * Note: we *must* insert the two items in item-number order, for the
    2519                 :      * benefit of _bt_restore_page().
    2520                 :      */
    2521 GIC        2618 :     Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
    2522            2618 :     if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
    2523                 :                     false, false) == InvalidOffsetNumber)
    2524 UIC           0 :         elog(PANIC, "failed to add leftkey to new root page"
    2525                 :              " while splitting block %u of index \"%s\"",
    2526 ECB             :              BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    2527                 : 
    2528                 :     /*
    2529 EUB             :      * insert the right page pointer into the new root page.
    2530                 :      */
    2531 GIC        2618 :     Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
    2532            2618 :     Assert(BTreeTupleGetNAtts(right_item, rel) <=
    2533                 :            IndexRelationGetNumberOfKeyAttributes(rel));
    2534            2618 :     if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
    2535                 :                     false, false) == InvalidOffsetNumber)
    2536 LBC           0 :         elog(PANIC, "failed to add rightkey to new root page"
    2537 ECB             :              " while splitting block %u of index \"%s\"",
    2538                 :              BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
    2539                 : 
    2540                 :     /* Clear the incomplete-split flag in the left child */
    2541 GBC        2618 :     Assert(P_INCOMPLETE_SPLIT(lopaque));
    2542 GIC        2618 :     lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
    2543            2618 :     MarkBufferDirty(lbuf);
    2544                 : 
    2545            2618 :     MarkBufferDirty(rootbuf);
    2546 CBC        2618 :     MarkBufferDirty(metabuf);
    2547 ECB             : 
    2548                 :     /* XLOG stuff */
    2549 GIC        2618 :     if (RelationNeedsWAL(rel))
    2550 ECB             :     {
    2551                 :         xl_btree_newroot xlrec;
    2552                 :         XLogRecPtr  recptr;
    2553                 :         xl_btree_metadata md;
    2554                 : 
    2555 GIC        2605 :         xlrec.rootblk = rootblknum;
    2556            2605 :         xlrec.level = metad->btm_level;
    2557                 : 
    2558            2605 :         XLogBeginInsert();
    2559            2605 :         XLogRegisterData((char *) &xlrec, SizeOfBtreeNewroot);
    2560 ECB             : 
    2561 CBC        2605 :         XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
    2562 GIC        2605 :         XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
    2563 CBC        2605 :         XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
    2564 ECB             : 
    2565 GIC        2605 :         Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
    2566 CBC        2605 :         md.version = metad->btm_version;
    2567            2605 :         md.root = rootblknum;
    2568            2605 :         md.level = metad->btm_level;
    2569 GIC        2605 :         md.fastroot = rootblknum;
    2570 CBC        2605 :         md.fastlevel = metad->btm_level;
    2571            2605 :         md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
    2572            2605 :         md.allequalimage = metad->btm_allequalimage;
    2573 ECB             : 
    2574 CBC        2605 :         XLogRegisterBufData(2, (char *) &md, sizeof(xl_btree_metadata));
    2575 ECB             : 
    2576                 :         /*
    2577                 :          * Direct access to page is not good but faster - we should implement
    2578                 :          * some new func in page API.
    2579                 :          */
    2580 GIC        2605 :         XLogRegisterBufData(0,
    2581            2605 :                             (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
    2582            2605 :                             ((PageHeader) rootpage)->pd_special -
    2583            2605 :                             ((PageHeader) rootpage)->pd_upper);
    2584                 : 
    2585 CBC        2605 :         recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
    2586 ECB             : 
    2587 CBC        2605 :         PageSetLSN(lpage, recptr);
    2588            2605 :         PageSetLSN(rootpage, recptr);
    2589 GIC        2605 :         PageSetLSN(metapg, recptr);
    2590 ECB             :     }
    2591                 : 
    2592 CBC        2618 :     END_CRIT_SECTION();
    2593 ECB             : 
    2594                 :     /* done with metapage */
    2595 GIC        2618 :     _bt_relbuf(rel, metabuf);
    2596                 : 
    2597 CBC        2618 :     pfree(left_item);
    2598 GIC        2618 :     pfree(right_item);
    2599                 : 
    2600 CBC        2618 :     return rootbuf;
    2601                 : }
    2602 ECB             : 
    2603                 : /*
    2604                 :  *  _bt_pgaddtup() -- add a data item to a particular page during split.
    2605                 :  *
    2606                 :  *      The difference between this routine and a bare PageAddItem call is
    2607                 :  *      that this code can deal with the first data item on an internal btree
    2608                 :  *      page in passing.  This data item (which is called "firstright" within
    2609                 :  *      _bt_split()) has a key that must be treated as minus infinity after
    2610                 :  *      the split.  Therefore, we truncate away all attributes when caller
    2611                 :  *      specifies it's the first data item on page (downlink is not changed,
    2612                 :  *      though).  This extra step is only needed for the right page of an
    2613                 :  *      internal page split.  There is no need to do this for the first data
    2614                 :  *      item on the existing/left page, since that will already have been
    2615                 :  *      truncated during an earlier page split.
    2616                 :  *
    2617                 :  *      See _bt_split() for a high level explanation of why we truncate here.
    2618                 :  *      Note that this routine has nothing to do with suffix truncation,
    2619                 :  *      despite using some of the same infrastructure.
    2620                 :  */
    2621                 : static inline bool
    2622 GIC     6963551 : _bt_pgaddtup(Page page,
    2623                 :              Size itemsize,
    2624                 :              IndexTuple itup,
    2625                 :              OffsetNumber itup_off,
    2626                 :              bool newfirstdataitem)
    2627 ECB             : {
    2628                 :     IndexTupleData trunctuple;
    2629                 : 
    2630 GIC     6963551 :     if (newfirstdataitem)
    2631                 :     {
    2632             101 :         trunctuple = *itup;
    2633             101 :         trunctuple.t_info = sizeof(IndexTupleData);
    2634             101 :         BTreeTupleSetNAtts(&trunctuple, 0, false);
    2635 CBC         101 :         itup = &trunctuple;
    2636 GIC         101 :         itemsize = sizeof(IndexTupleData);
    2637 ECB             :     }
    2638                 : 
    2639 CBC     6963551 :     if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
    2640 ECB             :                              false) == InvalidOffsetNumber))
    2641 LBC           0 :         return false;
    2642                 : 
    2643 GIC     6963551 :     return true;
    2644 ECB             : }
    2645                 : 
    2646 EUB             : /*
    2647                 :  * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
    2648 ECB             :  *
    2649                 :  * There are three operations performed here: simple index deletion, bottom-up
    2650                 :  * index deletion, and deduplication.  If all three operations fail to free
    2651                 :  * enough space for the incoming item then caller will go on to split the
    2652                 :  * page.  We always consider simple deletion first.  If that doesn't work out
    2653                 :  * we consider alternatives.  Callers that only want us to consider simple
    2654                 :  * deletion (without any fallback) ask for that using the 'simpleonly'
    2655                 :  * argument.
    2656                 :  *
    2657                 :  * We usually pick only one alternative "complex" operation when simple
    2658                 :  * deletion alone won't prevent a page split.  The 'checkingunique',
    2659                 :  * 'uniquedup', and 'indexUnchanged' arguments are used for that.
    2660                 :  *
    2661                 :  * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
    2662                 :  * level flag was found set.  The flag was useful back when there wasn't
    2663                 :  * necessarily one single page for a duplicate tuple to go on (before heap TID
    2664                 :  * became a part of the key space in version 4 indexes).  But we don't
    2665                 :  * actually look at the flag anymore (it's not a gating condition for our
    2666                 :  * caller).  That would cause us to miss tuples that are safe to delete,
    2667                 :  * without getting any benefit in return.  We know that the alternative is to
    2668                 :  * split the page; scanning the line pointer array in passing won't have
    2669                 :  * noticeable overhead.  (We still maintain the BTP_HAS_GARBAGE flag despite
    2670                 :  * all this because !heapkeyspace indexes must still do a "getting tired"
    2671                 :  * linear search, and so are likely to get some benefit from using it as a
    2672                 :  * gating condition.)
    2673                 :  */
    2674                 : static void
    2675 GIC       50681 : _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
    2676                 :                              BTInsertState insertstate,
    2677                 :                              bool simpleonly, bool checkingunique,
    2678                 :                              bool uniquedup, bool indexUnchanged)
    2679                 : {
    2680 ECB             :     OffsetNumber deletable[MaxIndexTuplesPerPage];
    2681 GIC       50681 :     int         ndeletable = 0;
    2682                 :     OffsetNumber offnum,
    2683                 :                 minoff,
    2684                 :                 maxoff;
    2685           50681 :     Buffer      buffer = insertstate->buf;
    2686 CBC       50681 :     BTScanInsert itup_key = insertstate->itup_key;
    2687 GIC       50681 :     Page        page = BufferGetPage(buffer);
    2688           50681 :     BTPageOpaque opaque = BTPageGetOpaque(page);
    2689                 : 
    2690 CBC       50681 :     Assert(P_ISLEAF(opaque));
    2691           50681 :     Assert(simpleonly || itup_key->heapkeyspace);
    2692           50681 :     Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
    2693 ECB             : 
    2694                 :     /*
    2695                 :      * Scan over all items to see which ones need to be deleted according to
    2696                 :      * LP_DEAD flags.  We'll usually manage to delete a few extra items that
    2697                 :      * are not marked LP_DEAD in passing.  Often the extra items that actually
    2698                 :      * end up getting deleted are items that would have had their LP_DEAD bit
    2699                 :      * set before long anyway (if we opted not to include them as extras).
    2700                 :      */
    2701 GIC       50681 :     minoff = P_FIRSTDATAKEY(opaque);
    2702           50681 :     maxoff = PageGetMaxOffsetNumber(page);
    2703           50681 :     for (offnum = minoff;
    2704        13208843 :          offnum <= maxoff;
    2705        13158162 :          offnum = OffsetNumberNext(offnum))
    2706 ECB             :     {
    2707 CBC    13158162 :         ItemId      itemId = PageGetItemId(page, offnum);
    2708 ECB             : 
    2709 CBC    13158162 :         if (ItemIdIsDead(itemId))
    2710          120725 :             deletable[ndeletable++] = offnum;
    2711                 :     }
    2712 ECB             : 
    2713 GIC       50681 :     if (ndeletable > 0)
    2714 ECB             :     {
    2715 CBC        4769 :         _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
    2716                 :                            insertstate->itup, minoff, maxoff);
    2717 GIC        4769 :         insertstate->bounds_valid = false;
    2718 ECB             : 
    2719                 :         /* Return when a page split has already been avoided */
    2720 CBC        4769 :         if (PageGetFreeSpace(page) >= insertstate->itemsz)
    2721 GIC       22013 :             return;
    2722 ECB             : 
    2723                 :         /* Might as well assume duplicates (if checkingunique) */
    2724 GIC         304 :         uniquedup = true;
    2725 ECB             :     }
    2726                 : 
    2727                 :     /*
    2728                 :      * We're done with simple deletion.  Return early with callers that only
    2729                 :      * call here so that simple deletion can be considered.  This includes
    2730                 :      * callers that explicitly ask for this and checkingunique callers that
    2731                 :      * probably don't have any version churn duplicates on the page.
    2732                 :      *
    2733                 :      * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
    2734                 :      * return at this point (or when we go on the try either or both of our
    2735                 :      * other strategies and they also fail).  We do not bother expending a
    2736                 :      * separate write to clear it, however.  Caller will definitely clear it
    2737                 :      * when it goes on to split the page (note also that the deduplication
    2738                 :      * process will clear the flag in passing, just to keep things tidy).
    2739                 :      */
    2740 GIC       46216 :     if (simpleonly || (checkingunique && !uniquedup))
    2741                 :     {
    2742           17413 :         Assert(!indexUnchanged);
    2743           17413 :         return;
    2744                 :     }
    2745 ECB             : 
    2746                 :     /* Assume bounds about to be invalidated (this is almost certain now) */
    2747 CBC       28803 :     insertstate->bounds_valid = false;
    2748 ECB             : 
    2749                 :     /*
    2750                 :      * Perform bottom-up index deletion pass when executor hint indicated that
    2751                 :      * incoming item is logically unchanged, or for a unique index that is
    2752                 :      * known to have physical duplicates for some other reason.  (There is a
    2753                 :      * large overlap between these two cases for a unique index.  It's worth
    2754                 :      * having both triggering conditions in order to apply the optimization in
    2755                 :      * the event of successive related INSERT and DELETE statements.)
    2756                 :      *
    2757                 :      * We'll go on to do a deduplication pass when a bottom-up pass fails to
    2758                 :      * delete an acceptable amount of free space (a significant fraction of
    2759                 :      * the page, or space for the new item, whichever is greater).
    2760                 :      *
    2761                 :      * Note: Bottom-up index deletion uses the same equality/equivalence
    2762                 :      * routines as deduplication internally.  However, it does not merge
    2763                 :      * together index tuples, so the same correctness considerations do not
    2764                 :      * apply.  We deliberately omit an index-is-allequalimage test here.
    2765                 :      */
    2766 GIC       32315 :     if ((indexUnchanged || uniquedup) &&
    2767            3512 :         _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
    2768             135 :         return;
    2769                 : 
    2770                 :     /* Perform deduplication pass (when enabled and index-is-allequalimage) */
    2771 CBC       28668 :     if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
    2772           28659 :         _bt_dedup_pass(rel, buffer, heapRel, insertstate->itup,
    2773           28659 :                        insertstate->itemsz, (indexUnchanged || uniquedup));
    2774                 : }
    2775                 : 
    2776 ECB             : /*
    2777                 :  * _bt_simpledel_pass - Simple index tuple deletion pass.
    2778                 :  *
    2779                 :  * We delete all LP_DEAD-set index tuples on a leaf page.  The offset numbers
    2780                 :  * of all such tuples are determined by caller (caller passes these to us as
    2781                 :  * its 'deletable' argument).
    2782                 :  *
    2783                 :  * We might also delete extra index tuples that turn out to be safe to delete
    2784                 :  * in passing (though they must be cheap to check in passing to begin with).
    2785                 :  * There is no certainty that any extra tuples will be deleted, though.  The
    2786                 :  * high level goal of the approach we take is to get the most out of each call
    2787                 :  * here (without noticeably increasing the per-call overhead compared to what
    2788                 :  * we need to do just to be able to delete the page's LP_DEAD-marked index
    2789                 :  * tuples).
    2790                 :  *
    2791                 :  * The number of extra index tuples that turn out to be deletable might
    2792                 :  * greatly exceed the number of LP_DEAD-marked index tuples due to various
    2793                 :  * locality related effects.  For example, it's possible that the total number
    2794                 :  * of table blocks (pointed to by all TIDs on the leaf page) is naturally
    2795                 :  * quite low, in which case we might end up checking if it's possible to
    2796                 :  * delete _most_ index tuples on the page (without the tableam needing to
    2797                 :  * access additional table blocks).  The tableam will sometimes stumble upon
    2798                 :  * _many_ extra deletable index tuples in indexes where this pattern is
    2799                 :  * common.
    2800                 :  *
    2801                 :  * See nbtree/README for further details on simple index tuple deletion.
    2802                 :  */
    2803                 : static void
    2804 GIC        4769 : _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
    2805                 :                    OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
    2806                 :                    OffsetNumber minoff, OffsetNumber maxoff)
    2807                 : {
    2808            4769 :     Page        page = BufferGetPage(buffer);
    2809 ECB             :     BlockNumber *deadblocks;
    2810                 :     int         ndeadblocks;
    2811                 :     TM_IndexDeleteOp delstate;
    2812                 :     OffsetNumber offnum;
    2813                 : 
    2814                 :     /* Get array of table blocks pointed to by LP_DEAD-set tuples */
    2815 GIC        4769 :     deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
    2816                 :                                 &ndeadblocks);
    2817                 : 
    2818                 :     /* Initialize tableam state that describes index deletion operation */
    2819            4769 :     delstate.irel = rel;
    2820 CBC        4769 :     delstate.iblknum = BufferGetBlockNumber(buffer);
    2821 GIC        4769 :     delstate.bottomup = false;
    2822            4769 :     delstate.bottomupfreespace = 0;
    2823            4769 :     delstate.ndeltids = 0;
    2824 CBC        4769 :     delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
    2825            4769 :     delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
    2826 ECB             : 
    2827 CBC        4769 :     for (offnum = minoff;
    2828         1219140 :          offnum <= maxoff;
    2829         1214371 :          offnum = OffsetNumberNext(offnum))
    2830 ECB             :     {
    2831 GIC     1214371 :         ItemId      itemid = PageGetItemId(page, offnum);
    2832 CBC     1214371 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
    2833         1214371 :         TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
    2834         1214371 :         TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
    2835                 :         BlockNumber tidblock;
    2836 ECB             :         void       *match;
    2837                 : 
    2838 CBC     1214371 :         if (!BTreeTupleIsPosting(itup))
    2839 ECB             :         {
    2840 GIC     1179738 :             tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
    2841         1179738 :             match = bsearch(&tidblock, deadblocks, ndeadblocks,
    2842                 :                             sizeof(BlockNumber), _bt_blk_cmp);
    2843 ECB             : 
    2844 GIC     1179738 :             if (!match)
    2845 ECB             :             {
    2846 CBC      790451 :                 Assert(!ItemIdIsDead(itemid));
    2847 GIC      790451 :                 continue;
    2848                 :             }
    2849 ECB             : 
    2850                 :             /*
    2851                 :              * TID's table block is among those pointed to by the TIDs from
    2852                 :              * LP_DEAD-bit set tuples on page -- add TID to deltids
    2853                 :              */
    2854 GIC      389287 :             odeltid->tid = itup->t_tid;
    2855          389287 :             odeltid->id = delstate.ndeltids;
    2856          389287 :             ostatus->idxoffnum = offnum;
    2857          389287 :             ostatus->knowndeletable = ItemIdIsDead(itemid);
    2858          389287 :             ostatus->promising = false; /* unused */
    2859 CBC      389287 :             ostatus->freespace = 0; /* unused */
    2860 ECB             : 
    2861 CBC      389287 :             delstate.ndeltids++;
    2862 ECB             :         }
    2863                 :         else
    2864                 :         {
    2865 GIC       34633 :             int         nitem = BTreeTupleGetNPosting(itup);
    2866 ECB             : 
    2867 GIC      181092 :             for (int p = 0; p < nitem; p++)
    2868                 :             {
    2869          146459 :                 ItemPointer tid = BTreeTupleGetPostingN(itup, p);
    2870 ECB             : 
    2871 GIC      146459 :                 tidblock = ItemPointerGetBlockNumber(tid);
    2872 CBC      146459 :                 match = bsearch(&tidblock, deadblocks, ndeadblocks,
    2873                 :                                 sizeof(BlockNumber), _bt_blk_cmp);
    2874 ECB             : 
    2875 GIC      146459 :                 if (!match)
    2876 ECB             :                 {
    2877 CBC      127648 :                     Assert(!ItemIdIsDead(itemid));
    2878 GIC      127648 :                     continue;
    2879                 :                 }
    2880 ECB             : 
    2881                 :                 /*
    2882                 :                  * TID's table block is among those pointed to by the TIDs
    2883                 :                  * from LP_DEAD-bit set tuples on page -- add TID to deltids
    2884                 :                  */
    2885 GIC       18811 :                 odeltid->tid = *tid;
    2886           18811 :                 odeltid->id = delstate.ndeltids;
    2887           18811 :                 ostatus->idxoffnum = offnum;
    2888           18811 :                 ostatus->knowndeletable = ItemIdIsDead(itemid);
    2889           18811 :                 ostatus->promising = false; /* unused */
    2890 CBC       18811 :                 ostatus->freespace = 0; /* unused */
    2891 ECB             : 
    2892 CBC       18811 :                 odeltid++;
    2893           18811 :                 ostatus++;
    2894           18811 :                 delstate.ndeltids++;
    2895 ECB             :             }
    2896                 :         }
    2897                 :     }
    2898                 : 
    2899 CBC        4769 :     pfree(deadblocks);
    2900                 : 
    2901 GIC        4769 :     Assert(delstate.ndeltids >= ndeletable);
    2902                 : 
    2903                 :     /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
    2904 CBC        4769 :     _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
    2905                 : 
    2906            4769 :     pfree(delstate.deltids);
    2907 GIC        4769 :     pfree(delstate.status);
    2908            4769 : }
    2909 ECB             : 
    2910                 : /*
    2911                 :  * _bt_deadblocks() -- Get LP_DEAD related table blocks.
    2912                 :  *
    2913                 :  * Builds sorted and unique-ified array of table block numbers from index
    2914                 :  * tuple TIDs whose line pointers are marked LP_DEAD.  Also adds the table
    2915                 :  * block from incoming newitem just in case it isn't among the LP_DEAD-related
    2916                 :  * table blocks.
    2917                 :  *
    2918                 :  * Always counting the newitem's table block as an LP_DEAD related block makes
    2919                 :  * sense because the cost is consistently low; it is practically certain that
    2920                 :  * the table block will not incur a buffer miss in tableam.  On the other hand
    2921                 :  * the benefit is often quite high.  There is a decent chance that there will
    2922                 :  * be some deletable items from this block, since in general most garbage
    2923                 :  * tuples became garbage in the recent past (in many cases this won't be the
    2924                 :  * first logical row that core code added to/modified in table block
    2925                 :  * recently).
    2926                 :  *
    2927                 :  * Returns final array, and sets *nblocks to its final size for caller.
    2928                 :  */
    2929                 : static BlockNumber *
    2930 GIC        4769 : _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
    2931                 :                IndexTuple newitem, int *nblocks)
    2932                 : {
    2933                 :     int         spacentids,
    2934                 :                 ntids;
    2935 ECB             :     BlockNumber *tidblocks;
    2936                 : 
    2937                 :     /*
    2938                 :      * Accumulate each TID's block in array whose initial size has space for
    2939                 :      * one table block per LP_DEAD-set tuple (plus space for the newitem table
    2940                 :      * block).  Array will only need to grow when there are LP_DEAD-marked
    2941                 :      * posting list tuples (which is not that common).
    2942                 :      */
    2943 GIC        4769 :     spacentids = ndeletable + 1;
    2944            4769 :     ntids = 0;
    2945            4769 :     tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
    2946                 : 
    2947                 :     /*
    2948 ECB             :      * First add the table block for the incoming newitem.  This is the one
    2949                 :      * case where simple deletion can visit a table block that doesn't have
    2950                 :      * any known deletable items.
    2951                 :      */
    2952 GIC        4769 :     Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
    2953            4769 :     tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
    2954                 : 
    2955          125494 :     for (int i = 0; i < ndeletable; i++)
    2956                 :     {
    2957 CBC      120725 :         ItemId      itemid = PageGetItemId(page, deletable[i]);
    2958          120725 :         IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
    2959                 : 
    2960          120725 :         Assert(ItemIdIsDead(itemid));
    2961                 : 
    2962          120725 :         if (!BTreeTupleIsPosting(itup))
    2963 ECB             :         {
    2964 GIC      117259 :             if (ntids + 1 > spacentids)
    2965 ECB             :             {
    2966 GIC          88 :                 spacentids *= 2;
    2967 ECB             :                 tidblocks = (BlockNumber *)
    2968 GIC          88 :                     repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
    2969 ECB             :             }
    2970                 : 
    2971 CBC      117259 :             tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
    2972                 :         }
    2973 ECB             :         else
    2974                 :         {
    2975 GIC        3466 :             int         nposting = BTreeTupleGetNPosting(itup);
    2976 ECB             : 
    2977 GIC        3466 :             if (ntids + nposting > spacentids)
    2978                 :             {
    2979              82 :                 spacentids = Max(spacentids * 2, ntids + nposting);
    2980 ECB             :                 tidblocks = (BlockNumber *)
    2981 GIC          82 :                     repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
    2982 ECB             :             }
    2983                 : 
    2984 CBC       11792 :             for (int j = 0; j < nposting; j++)
    2985                 :             {
    2986            8326 :                 ItemPointer tid = BTreeTupleGetPostingN(itup, j);
    2987                 : 
    2988 GIC        8326 :                 tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
    2989 ECB             :             }
    2990                 :         }
    2991                 :     }
    2992                 : 
    2993 CBC        4769 :     qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
    2994 GIC        4769 :     *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
    2995                 : 
    2996            4769 :     return tidblocks;
    2997                 : }
    2998 ECB             : 
    2999                 : /*
    3000                 :  * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
    3001                 :  */
    3002                 : static inline int
    3003 GIC     2888450 : _bt_blk_cmp(const void *arg1, const void *arg2)
    3004                 : {
    3005         2888450 :     BlockNumber b1 = *((BlockNumber *) arg1);
    3006         2888450 :     BlockNumber b2 = *((BlockNumber *) arg2);
    3007                 : 
    3008 CBC     2888450 :     if (b1 < b2)
    3009 GIC     1570073 :         return -1;
    3010 CBC     1318377 :     else if (b1 > b2)
    3011          660386 :         return 1;
    3012                 : 
    3013          657991 :     return 0;
    3014 ECB             : }
         |