LCOV - differential code coverage report
Current view: top level - src/backend/access/nbtree - nbtsearch.c (source / functions) Coverage Total Hit UNC LBC UIC UBC GBC GIC GNC CBC EUB ECB DCB
Current: Differential Code Coverage HEAD vs 15 Lines: 87.2 % 710 619 3 20 45 23 11 375 16 217 57 378 4
Current Date: 2023-04-08 15:15:32 Functions: 100.0 % 20 20 17 3 19 1
Baseline: 15
Baseline Date: 2023-04-08 15:09:40
Legend: Lines: hit not hit

           TLA  Line data    Source code
       1                 : /*-------------------------------------------------------------------------
       2                 :  *
       3                 :  * nbtsearch.c
       4                 :  *    Search code for postgres btrees.
       5                 :  *
       6                 :  *
       7                 :  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
       8                 :  * Portions Copyright (c) 1994, Regents of the University of California
       9                 :  *
      10                 :  * IDENTIFICATION
      11                 :  *    src/backend/access/nbtree/nbtsearch.c
      12                 :  *
      13                 :  *-------------------------------------------------------------------------
      14                 :  */
      15                 : 
      16                 : #include "postgres.h"
      17                 : 
      18                 : #include "access/nbtree.h"
      19                 : #include "access/relscan.h"
      20                 : #include "miscadmin.h"
      21                 : #include "pgstat.h"
      22                 : #include "storage/predicate.h"
      23                 : #include "utils/lsyscache.h"
      24                 : #include "utils/rel.h"
      25                 : 
      26                 : 
      27                 : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
      28                 : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
      29                 : static int  _bt_binsrch_posting(BTScanInsert key, Page page,
      30                 :                                 OffsetNumber offnum);
      31                 : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
      32                 :                          OffsetNumber offnum);
      33                 : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
      34                 :                          OffsetNumber offnum, IndexTuple itup);
      35                 : static int  _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
      36                 :                                   OffsetNumber offnum, ItemPointer heapTid,
      37                 :                                   IndexTuple itup);
      38                 : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
      39                 :                                        OffsetNumber offnum,
      40                 :                                        ItemPointer heapTid, int tupleOffset);
      41                 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
      42                 : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
      43                 : static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
      44                 :                                   ScanDirection dir);
      45                 : static Buffer _bt_walk_left(Relation rel, Relation heaprel, Buffer buf,
      46                 :                             Snapshot snapshot);
      47                 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
      48                 : static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
      49                 : 
      50                 : 
      51                 : /*
      52                 :  *  _bt_drop_lock_and_maybe_pin()
      53                 :  *
      54                 :  * Unlock the buffer; and if it is safe to release the pin, do that, too.
      55                 :  * This will prevent vacuum from stalling in a blocked state trying to read a
      56                 :  * page when a cursor is sitting on it.
      57                 :  *
      58                 :  * See nbtree/README section on making concurrent TID recycling safe.
      59                 :  */
      60                 : static void
      61 GIC     5022230 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
      62 ECB             : {
      63 GIC     5022230 :     _bt_unlockbuf(scan->indexRelation, sp->buf);
      64 ECB             : 
      65 GIC     5022230 :     if (IsMVCCSnapshot(scan->xs_snapshot) &&
      66 CBC     4849383 :         RelationNeedsWAL(scan->indexRelation) &&
      67         4846854 :         !scan->xs_want_itup)
      68 ECB             :     {
      69 GIC     4797883 :         ReleaseBuffer(sp->buf);
      70 CBC     4797883 :         sp->buf = InvalidBuffer;
      71 ECB             :     }
      72 GIC     5022230 : }
      73 ECB             : 
      74                 : /*
      75                 :  *  _bt_search() -- Search the tree for a particular scankey,
      76                 :  *      or more precisely for the first leaf page it could be on.
      77                 :  *
      78                 :  * The passed scankey is an insertion-type scankey (see nbtree/README),
      79                 :  * but it can omit the rightmost column(s) of the index.
      80                 :  *
      81                 :  * Return value is a stack of parent-page pointers (i.e. there is no entry for
      82                 :  * the leaf level/page).  *bufP is set to the address of the leaf-page buffer,
      83                 :  * which is locked and pinned.  No locks are held on the parent pages,
      84                 :  * however!
      85                 :  *
      86                 :  * If the snapshot parameter is not NULL, "old snapshot" checking will take
      87                 :  * place during the descent through the tree.  This is not needed when
      88                 :  * positioning for an insert or delete, so NULL is used for those cases.
      89                 :  *
      90                 :  * The returned buffer is locked according to access parameter.  Additionally,
      91                 :  * access = BT_WRITE will allow an empty root page to be created and returned.
      92                 :  * When access = BT_READ, an empty index will result in *bufP being set to
      93                 :  * InvalidBuffer.  Also, in BT_WRITE mode, any incomplete splits encountered
      94                 :  * during the search will be finished.
      95                 :  */
      96                 : BTStack
      97 GNC    16390240 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
      98                 :            int access, Snapshot snapshot)
      99                 : {
     100 GIC    16390240 :     BTStack     stack_in = NULL;
     101 CBC    16390240 :     int         page_access = BT_READ;
     102 ECB             : 
     103                 :     /* Get the root page to start with */
     104 GNC    16390240 :     *bufP = _bt_getroot(rel, heaprel, access);
     105 ECB             : 
     106                 :     /* If index is empty and access = BT_READ, no root page is created. */
     107 GIC    16390240 :     if (!BufferIsValid(*bufP))
     108 CBC      385754 :         return (BTStack) NULL;
     109 ECB             : 
     110                 :     /* Loop iterates once per level descended in the tree */
     111                 :     for (;;)
     112 GIC    12274663 :     {
     113 ECB             :         Page        page;
     114                 :         BTPageOpaque opaque;
     115                 :         OffsetNumber offnum;
     116                 :         ItemId      itemid;
     117                 :         IndexTuple  itup;
     118                 :         BlockNumber child;
     119                 :         BTStack     new_stack;
     120                 : 
     121                 :         /*
     122                 :          * Race -- the page we just grabbed may have split since we read its
     123                 :          * downlink in its parent page (or the metapage).  If it has, we may
     124                 :          * need to move right to its new sibling.  Do that.
     125                 :          *
     126                 :          * In write-mode, allow _bt_moveright to finish any incomplete splits
     127                 :          * along the way.  Strictly speaking, we'd only need to finish an
     128                 :          * incomplete split on the leaf page we're about to insert to, not on
     129                 :          * any of the upper levels (internal pages with incomplete splits are
     130                 :          * also taken care of in _bt_getstackbuf).  But this is a good
     131                 :          * opportunity to finish splits of internal pages too.
     132                 :          */
     133 GNC    28279149 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
     134                 :                               stack_in, page_access, snapshot);
     135                 : 
     136                 :         /* if this is a leaf page, we're done */
     137 GIC    28279149 :         page = BufferGetPage(*bufP);
     138 CBC    28279149 :         opaque = BTPageGetOpaque(page);
     139        28279149 :         if (P_ISLEAF(opaque))
     140        16004486 :             break;
     141 ECB             : 
     142                 :         /*
     143                 :          * Find the appropriate pivot tuple on this page.  Its downlink points
     144                 :          * to the child page that we're about to descend to.
     145                 :          */
     146 GIC    12274663 :         offnum = _bt_binsrch(rel, key, *bufP);
     147 CBC    12274663 :         itemid = PageGetItemId(page, offnum);
     148        12274663 :         itup = (IndexTuple) PageGetItem(page, itemid);
     149        12274663 :         Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
     150        12274663 :         child = BTreeTupleGetDownLink(itup);
     151 ECB             : 
     152                 :         /*
     153                 :          * We need to save the location of the pivot tuple we chose in a new
     154                 :          * stack entry for this page/level.  If caller ends up splitting a
     155                 :          * page one level down, it usually ends up inserting a new pivot
     156                 :          * tuple/downlink immediately after the location recorded here.
     157                 :          */
     158 GIC    12274663 :         new_stack = (BTStack) palloc(sizeof(BTStackData));
     159 CBC    12274663 :         new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
     160        12274663 :         new_stack->bts_offset = offnum;
     161        12274663 :         new_stack->bts_parent = stack_in;
     162 ECB             : 
     163                 :         /*
     164                 :          * Page level 1 is lowest non-leaf page level prior to leaves.  So, if
     165                 :          * we're on the level 1 and asked to lock leaf page in write mode,
     166                 :          * then lock next page in write mode, because it must be a leaf.
     167                 :          */
     168 GIC    12274663 :         if (opaque->btpo_level == 1 && access == BT_WRITE)
     169 CBC     5665181 :             page_access = BT_WRITE;
     170 ECB             : 
     171                 :         /* drop the read lock on the page, then acquire one on its child */
     172 GIC    12274663 :         *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
     173 ECB             : 
     174                 :         /* okay, all set to move down a level */
     175 GIC    12274663 :         stack_in = new_stack;
     176 ECB             :     }
     177                 : 
     178                 :     /*
     179                 :      * If we're asked to lock leaf in write mode, but didn't manage to, then
     180                 :      * relock.  This should only happen when the root page is a leaf page (and
     181                 :      * the only page in the index other than the metapage).
     182                 :      */
     183 GIC    16004486 :     if (access == BT_WRITE && page_access == BT_READ)
     184 ECB             :     {
     185                 :         /* trade in our read lock for a write lock */
     186 GIC     1468243 :         _bt_unlockbuf(rel, *bufP);
     187 CBC     1468243 :         _bt_lockbuf(rel, *bufP, BT_WRITE);
     188 ECB             : 
     189                 :         /*
     190                 :          * Race -- the leaf page may have split after we dropped the read lock
     191                 :          * but before we acquired a write lock.  If it has, we may need to
     192                 :          * move right to its new sibling.  Do that.
     193                 :          */
     194 GNC     1468243 :         *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE,
     195 ECB             :                               snapshot);
     196                 :     }
     197                 : 
     198 GIC    16004486 :     return stack_in;
     199 ECB             : }
     200                 : 
     201                 : /*
     202                 :  *  _bt_moveright() -- move right in the btree if necessary.
     203                 :  *
     204                 :  * When we follow a pointer to reach a page, it is possible that
     205                 :  * the page has changed in the meanwhile.  If this happens, we're
     206                 :  * guaranteed that the page has "split right" -- that is, that any
     207                 :  * data that appeared on the page originally is either on the page
     208                 :  * or strictly to the right of it.
     209                 :  *
     210                 :  * This routine decides whether or not we need to move right in the
     211                 :  * tree by examining the high key entry on the page.  If that entry is
     212                 :  * strictly less than the scankey, or <= the scankey in the
     213                 :  * key.nextkey=true case, then we followed the wrong link and we need
     214                 :  * to move right.
     215                 :  *
     216                 :  * The passed insertion-type scankey can omit the rightmost column(s) of the
     217                 :  * index. (see nbtree/README)
     218                 :  *
     219                 :  * When key.nextkey is false (the usual case), we are looking for the first
     220                 :  * item >= key.  When key.nextkey is true, we are looking for the first item
     221                 :  * strictly greater than key.
     222                 :  *
     223                 :  * If forupdate is true, we will attempt to finish any incomplete splits
     224                 :  * that we encounter.  This is required when locking a target page for an
     225                 :  * insertion, because we don't allow inserting on a page before the split
     226                 :  * is completed.  'stack' is only used if forupdate is true.
     227                 :  *
     228                 :  * On entry, we have the buffer pinned and a lock of the type specified by
     229                 :  * 'access'.  If we move right, we release the buffer and lock and acquire
     230                 :  * the same on the right sibling.  Return value is the buffer we stop at.
     231                 :  *
     232                 :  * If the snapshot parameter is not NULL, "old snapshot" checking will take
     233                 :  * place during the descent through the tree.  This is not needed when
     234                 :  * positioning for an insert or delete, so NULL is used for those cases.
     235                 :  */
     236                 : Buffer
     237 GIC    29747392 : _bt_moveright(Relation rel,
     238                 :               Relation heaprel,
     239 ECB             :               BTScanInsert key,
     240                 :               Buffer buf,
     241                 :               bool forupdate,
     242                 :               BTStack stack,
     243                 :               int access,
     244                 :               Snapshot snapshot)
     245                 : {
     246                 :     Page        page;
     247                 :     BTPageOpaque opaque;
     248                 :     int32       cmpval;
     249                 : 
     250                 :     /*
     251                 :      * When nextkey = false (normal case): if the scan key that brought us to
     252                 :      * this page is > the high key stored on the page, then the page has split
     253                 :      * and we need to move right.  (pg_upgrade'd !heapkeyspace indexes could
     254                 :      * have some duplicates to the right as well as the left, but that's
     255                 :      * something that's only ever dealt with on the leaf level, after
     256                 :      * _bt_search has found an initial leaf page.)
     257                 :      *
     258                 :      * When nextkey = true: move right if the scan key is >= page's high key.
     259                 :      * (Note that key.scantid cannot be set in this case.)
     260                 :      *
     261                 :      * The page could even have split more than once, so scan as far as
     262                 :      * needed.
     263                 :      *
     264                 :      * We also have to move right if we followed a link that brought us to a
     265                 :      * dead page.
     266                 :      */
     267 GIC    29747392 :     cmpval = key->nextkey ? 0 : 1;
     268                 : 
     269 ECB             :     for (;;)
     270                 :     {
     271 GIC    29748068 :         page = BufferGetPage(buf);
     272        29748068 :         TestForOldSnapshot(snapshot, rel, page);
     273 CBC    29748068 :         opaque = BTPageGetOpaque(page);
     274 ECB             : 
     275 CBC    29748068 :         if (P_RIGHTMOST(opaque))
     276 GIC    23650629 :             break;
     277 ECB             : 
     278                 :         /*
     279                 :          * Finish any incomplete splits we encounter along the way.
     280                 :          */
     281 GIC     6097439 :         if (forupdate && P_INCOMPLETE_SPLIT(opaque))
     282 UIC           0 :         {
     283 LBC           0 :             BlockNumber blkno = BufferGetBlockNumber(buf);
     284 EUB             : 
     285                 :             /* upgrade our lock if necessary */
     286 UIC           0 :             if (access == BT_READ)
     287                 :             {
     288 UBC           0 :                 _bt_unlockbuf(rel, buf);
     289 UIC           0 :                 _bt_lockbuf(rel, buf, BT_WRITE);
     290 EUB             :             }
     291                 : 
     292 UIC           0 :             if (P_INCOMPLETE_SPLIT(opaque))
     293 UNC           0 :                 _bt_finish_split(rel, heaprel, buf, stack);
     294 EUB             :             else
     295 UBC           0 :                 _bt_relbuf(rel, buf);
     296                 : 
     297 EUB             :             /* re-acquire the lock in the right mode, and re-check */
     298 UNC           0 :             buf = _bt_getbuf(rel, heaprel, blkno, access);
     299 UIC           0 :             continue;
     300 EUB             :         }
     301                 : 
     302 GIC     6097439 :         if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
     303                 :         {
     304 ECB             :             /* step right one page */
     305 GIC         676 :             buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
     306             676 :             continue;
     307 ECB             :         }
     308                 :         else
     309                 :             break;
     310                 :     }
     311                 : 
     312 GIC    29747392 :     if (P_IGNORE(opaque))
     313 UIC           0 :         elog(ERROR, "fell off the end of index \"%s\"",
     314 ECB             :              RelationGetRelationName(rel));
     315 EUB             : 
     316 GIC    29747392 :     return buf;
     317                 : }
     318 ECB             : 
     319                 : /*
     320                 :  *  _bt_binsrch() -- Do a binary search for a key on a particular page.
     321                 :  *
     322                 :  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
     323                 :  * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
     324                 :  * particular, this means it is possible to return a value 1 greater than the
     325                 :  * number of keys on the page, if the scankey is > all keys on the page.)
     326                 :  *
     327                 :  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
     328                 :  * of the last key < given scankey, or last key <= given scankey if nextkey
     329                 :  * is true.  (Since _bt_compare treats the first data key of such a page as
     330                 :  * minus infinity, there will be at least one key < scankey, so the result
     331                 :  * always points at one of the keys on the page.)  This key indicates the
     332                 :  * right place to descend to be sure we find all leaf keys >= given scankey
     333                 :  * (or leaf keys > given scankey when nextkey is true).
     334                 :  *
     335                 :  * This procedure is not responsible for walking right, it just examines
     336                 :  * the given page.  _bt_binsrch() has no lock or refcount side effects
     337                 :  * on the buffer.
     338                 :  */
     339                 : static OffsetNumber
     340 GIC    20943015 : _bt_binsrch(Relation rel,
     341                 :             BTScanInsert key,
     342 ECB             :             Buffer buf)
     343                 : {
     344                 :     Page        page;
     345                 :     BTPageOpaque opaque;
     346                 :     OffsetNumber low,
     347                 :                 high;
     348                 :     int32       result,
     349                 :                 cmpval;
     350                 : 
     351 GIC    20943015 :     page = BufferGetPage(buf);
     352        20943015 :     opaque = BTPageGetOpaque(page);
     353 ECB             : 
     354                 :     /* Requesting nextkey semantics while using scantid seems nonsensical */
     355 GIC    20943015 :     Assert(!key->nextkey || key->scantid == NULL);
     356                 :     /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
     357 CBC    20943015 :     Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
     358                 : 
     359        20943015 :     low = P_FIRSTDATAKEY(opaque);
     360 GIC    20943015 :     high = PageGetMaxOffsetNumber(page);
     361 ECB             : 
     362                 :     /*
     363                 :      * If there are no keys on the page, return the first available slot. Note
     364                 :      * this covers two cases: the page is really empty (no keys), or it
     365                 :      * contains only a high key.  The latter case is possible after vacuuming.
     366                 :      * This can never happen on an internal page, however, since they are
     367                 :      * never empty (an internal page must have children).
     368                 :      */
     369 GIC    20943015 :     if (unlikely(high < low))
     370            2170 :         return low;
     371 ECB             : 
     372                 :     /*
     373                 :      * Binary search to find the first key on the page >= scan key, or first
     374                 :      * key > scankey when nextkey is true.
     375                 :      *
     376                 :      * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     377                 :      * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     378                 :      *
     379                 :      * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     380                 :      * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     381                 :      *
     382                 :      * We can fall out when high == low.
     383                 :      */
     384 GIC    20940845 :     high++;                     /* establish the loop invariant for high */
     385                 : 
     386 CBC    20940845 :     cmpval = key->nextkey ? 0 : 1;   /* select comparison value */
     387                 : 
     388       127472718 :     while (high > low)
     389                 :     {
     390       106531873 :         OffsetNumber mid = low + ((high - low) / 2);
     391                 : 
     392 ECB             :         /* We have low <= mid < high, so mid points at a real slot */
     393                 : 
     394 GIC   106531873 :         result = _bt_compare(rel, key, page, mid);
     395                 : 
     396 CBC   106531873 :         if (result >= cmpval)
     397 GIC    70298342 :             low = mid + 1;
     398 ECB             :         else
     399 CBC    36233531 :             high = mid;
     400                 :     }
     401 ECB             : 
     402                 :     /*
     403                 :      * At this point we have high == low, but be careful: they could point
     404                 :      * past the last slot on the page.
     405                 :      *
     406                 :      * On a leaf page, we always return the first key >= scan key (resp. >
     407                 :      * scan key), which could be the last slot + 1.
     408                 :      */
     409 GIC    20940845 :     if (P_ISLEAF(opaque))
     410         8666182 :         return low;
     411 ECB             : 
     412                 :     /*
     413                 :      * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     414                 :      * There must be one if _bt_compare() is playing by the rules.
     415                 :      */
     416 GIC    12274663 :     Assert(low > P_FIRSTDATAKEY(opaque));
     417                 : 
     418 CBC    12274663 :     return OffsetNumberPrev(low);
     419                 : }
     420 ECB             : 
     421                 : /*
     422                 :  *
     423                 :  *  _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
     424                 :  *
     425                 :  * Like _bt_binsrch(), but with support for caching the binary search
     426                 :  * bounds.  Only used during insertion, and only on the leaf page that it
     427                 :  * looks like caller will insert tuple on.  Exclusive-locked and pinned
     428                 :  * leaf page is contained within insertstate.
     429                 :  *
     430                 :  * Caches the bounds fields in insertstate so that a subsequent call can
     431                 :  * reuse the low and strict high bounds of original binary search.  Callers
     432                 :  * that use these fields directly must be prepared for the case where low
     433                 :  * and/or stricthigh are not on the same page (one or both exceed maxoff
     434                 :  * for the page).  The case where there are no items on the page (high <
     435                 :  * low) makes bounds invalid.
     436                 :  *
     437                 :  * Caller is responsible for invalidating bounds when it modifies the page
     438                 :  * before calling here a second time, and for dealing with posting list
     439                 :  * tuple matches (callers can use insertstate's postingoff field to
     440                 :  * determine which existing heap TID will need to be replaced by a posting
     441                 :  * list split).
     442                 :  */
     443                 : OffsetNumber
     444 GIC    12616278 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
     445                 : {
     446 CBC    12616278 :     BTScanInsert key = insertstate->itup_key;
     447                 :     Page        page;
     448 ECB             :     BTPageOpaque opaque;
     449                 :     OffsetNumber low,
     450                 :                 high,
     451                 :                 stricthigh;
     452                 :     int32       result,
     453                 :                 cmpval;
     454                 : 
     455 GIC    12616278 :     page = BufferGetPage(insertstate->buf);
     456        12616278 :     opaque = BTPageGetOpaque(page);
     457 ECB             : 
     458 CBC    12616278 :     Assert(P_ISLEAF(opaque));
     459 GIC    12616278 :     Assert(!key->nextkey);
     460 CBC    12616278 :     Assert(insertstate->postingoff == 0);
     461 ECB             : 
     462 CBC    12616278 :     if (!insertstate->bounds_valid)
     463                 :     {
     464 ECB             :         /* Start new binary search */
     465 GIC     7384948 :         low = P_FIRSTDATAKEY(opaque);
     466         7384948 :         high = PageGetMaxOffsetNumber(page);
     467 ECB             :     }
     468                 :     else
     469                 :     {
     470                 :         /* Restore result of previous binary search against same page */
     471 GIC     5231330 :         low = insertstate->low;
     472         5231330 :         high = insertstate->stricthigh;
     473 ECB             :     }
     474                 : 
     475                 :     /* If there are no keys on the page, return the first available slot */
     476 GIC    12616278 :     if (unlikely(high < low))
     477                 :     {
     478 ECB             :         /* Caller can't reuse bounds */
     479 GIC       19103 :         insertstate->low = InvalidOffsetNumber;
     480           19103 :         insertstate->stricthigh = InvalidOffsetNumber;
     481 CBC       19103 :         insertstate->bounds_valid = false;
     482           19103 :         return low;
     483 ECB             :     }
     484                 : 
     485                 :     /*
     486                 :      * Binary search to find the first key on the page >= scan key. (nextkey
     487                 :      * is always false when inserting).
     488                 :      *
     489                 :      * The loop invariant is: all slots before 'low' are < scan key, all slots
     490                 :      * at or after 'high' are >= scan key.  'stricthigh' is > scan key, and is
     491                 :      * maintained to save additional search effort for caller.
     492                 :      *
     493                 :      * We can fall out when high == low.
     494                 :      */
     495 GIC    12597175 :     if (!insertstate->bounds_valid)
     496         7365845 :         high++;                 /* establish the loop invariant for high */
     497 CBC    12597175 :     stricthigh = high;          /* high initially strictly higher */
     498 ECB             : 
     499 CBC    12597175 :     cmpval = 1;                 /* !nextkey comparison value */
     500                 : 
     501        65283644 :     while (high > low)
     502                 :     {
     503        52686469 :         OffsetNumber mid = low + ((high - low) / 2);
     504                 : 
     505 ECB             :         /* We have low <= mid < high, so mid points at a real slot */
     506                 : 
     507 GIC    52686469 :         result = _bt_compare(rel, key, page, mid);
     508                 : 
     509 CBC    52686469 :         if (result >= cmpval)
     510 GIC    40616979 :             low = mid + 1;
     511 ECB             :         else
     512                 :         {
     513 GIC    12069490 :             high = mid;
     514        12069490 :             if (result != 0)
     515 CBC    11435404 :                 stricthigh = high;
     516 ECB             :         }
     517                 : 
     518                 :         /*
     519                 :          * If tuple at offset located by binary search is a posting list whose
     520                 :          * TID range overlaps with caller's scantid, perform posting list
     521                 :          * binary search to set postingoff for caller.  Caller must split the
     522                 :          * posting list when postingoff is set.  This should happen
     523                 :          * infrequently.
     524                 :          */
     525 GIC    52686469 :         if (unlikely(result == 0 && key->scantid != NULL))
     526                 :         {
     527 ECB             :             /*
     528                 :              * postingoff should never be set more than once per leaf page
     529                 :              * binary search.  That would mean that there are duplicate table
     530                 :              * TIDs in the index, which is never okay.  Check for that here.
     531                 :              */
     532 GIC      207807 :             if (insertstate->postingoff != 0)
     533 UIC           0 :                 ereport(ERROR,
     534 ECB             :                         (errcode(ERRCODE_INDEX_CORRUPTED),
     535 EUB             :                          errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
     536                 :                                          ItemPointerGetBlockNumber(key->scantid),
     537                 :                                          ItemPointerGetOffsetNumber(key->scantid),
     538                 :                                          low, stricthigh,
     539                 :                                          BufferGetBlockNumber(insertstate->buf),
     540                 :                                          RelationGetRelationName(rel))));
     541                 : 
     542 GIC      207807 :             insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
     543                 :         }
     544 ECB             :     }
     545                 : 
     546                 :     /*
     547                 :      * On a leaf page, a binary search always returns the first key >= scan
     548                 :      * key (at least in !nextkey case), which could be the last slot + 1. This
     549                 :      * is also the lower bound of cached search.
     550                 :      *
     551                 :      * stricthigh may also be the last slot + 1, which prevents caller from
     552                 :      * using bounds directly, but is still useful to us if we're called a
     553                 :      * second time with cached bounds (cached low will be < stricthigh when
     554                 :      * that happens).
     555                 :      */
     556 GIC    12597175 :     insertstate->low = low;
     557        12597175 :     insertstate->stricthigh = stricthigh;
     558 CBC    12597175 :     insertstate->bounds_valid = true;
     559 ECB             : 
     560 CBC    12597175 :     return low;
     561                 : }
     562 ECB             : 
     563                 : /*----------
     564                 :  *  _bt_binsrch_posting() -- posting list binary search.
     565                 :  *
     566                 :  * Helper routine for _bt_binsrch_insert().
     567                 :  *
     568                 :  * Returns offset into posting list where caller's scantid belongs.
     569                 :  *----------
     570                 :  */
     571                 : static int
     572 GIC      207807 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
     573                 : {
     574 ECB             :     IndexTuple  itup;
     575                 :     ItemId      itemid;
     576                 :     int         low,
     577                 :                 high,
     578                 :                 mid,
     579                 :                 res;
     580                 : 
     581                 :     /*
     582                 :      * If this isn't a posting tuple, then the index must be corrupt (if it is
     583                 :      * an ordinary non-pivot tuple then there must be an existing tuple with a
     584                 :      * heap TID that equals inserter's new heap TID/scantid).  Defensively
     585                 :      * check that tuple is a posting list tuple whose posting list range
     586                 :      * includes caller's scantid.
     587                 :      *
     588                 :      * (This is also needed because contrib/amcheck's rootdescend option needs
     589                 :      * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
     590                 :      */
     591 GIC      207807 :     itemid = PageGetItemId(page, offnum);
     592          207807 :     itup = (IndexTuple) PageGetItem(page, itemid);
     593 CBC      207807 :     if (!BTreeTupleIsPosting(itup))
     594          200041 :         return 0;
     595 ECB             : 
     596 CBC        7766 :     Assert(key->heapkeyspace && key->allequalimage);
     597                 : 
     598 ECB             :     /*
     599                 :      * In the event that posting list tuple has LP_DEAD bit set, indicate this
     600                 :      * to _bt_binsrch_insert() caller by returning -1, a sentinel value.  A
     601                 :      * second call to _bt_binsrch_insert() can take place when its caller has
     602                 :      * removed the dead item.
     603                 :      */
     604 GIC        7766 :     if (ItemIdIsDead(itemid))
     605 UIC           0 :         return -1;
     606 ECB             : 
     607 EUB             :     /* "high" is past end of posting list for loop invariant */
     608 GIC        7766 :     low = 0;
     609            7766 :     high = BTreeTupleGetNPosting(itup);
     610 CBC        7766 :     Assert(high >= 2);
     611 ECB             : 
     612 CBC       62476 :     while (high > low)
     613                 :     {
     614           54710 :         mid = low + ((high - low) / 2);
     615 GIC       54710 :         res = ItemPointerCompare(key->scantid,
     616 ECB             :                                  BTreeTupleGetPostingN(itup, mid));
     617                 : 
     618 GIC       54710 :         if (res > 0)
     619           28848 :             low = mid + 1;
     620 CBC       25862 :         else if (res < 0)
     621           25862 :             high = mid;
     622 ECB             :         else
     623 LBC           0 :             return mid;
     624                 :     }
     625 EUB             : 
     626                 :     /* Exact match not found */
     627 GIC        7766 :     return low;
     628                 : }
     629 ECB             : 
     630                 : /*----------
     631                 :  *  _bt_compare() -- Compare insertion-type scankey to tuple on a page.
     632                 :  *
     633                 :  *  page/offnum: location of btree item to be compared to.
     634                 :  *
     635                 :  *      This routine returns:
     636                 :  *          <0 if scankey < tuple at offnum;
     637                 :  *           0 if scankey == tuple at offnum;
     638                 :  *          >0 if scankey > tuple at offnum.
     639                 :  *
     640                 :  * NULLs in the keys are treated as sortable values.  Therefore
     641                 :  * "equality" does not necessarily mean that the item should be returned
     642                 :  * to the caller as a matching key.  Similarly, an insertion scankey
     643                 :  * with its scantid set is treated as equal to a posting tuple whose TID
     644                 :  * range overlaps with their scantid.  There generally won't be a
     645                 :  * matching TID in the posting tuple, which caller must handle
     646                 :  * themselves (e.g., by splitting the posting list tuple).
     647                 :  *
     648                 :  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
     649                 :  * "minus infinity": this routine will always claim it is less than the
     650                 :  * scankey.  The actual key value stored is explicitly truncated to 0
     651                 :  * attributes (explicitly minus infinity) with version 3+ indexes, but
     652                 :  * that isn't relied upon.  This allows us to implement the Lehman and
     653                 :  * Yao convention that the first down-link pointer is before the first
     654                 :  * key.  See backend/access/nbtree/README for details.
     655                 :  *----------
     656                 :  */
     657                 : int32
     658 GIC   177440905 : _bt_compare(Relation rel,
     659                 :             BTScanInsert key,
     660 ECB             :             Page page,
     661                 :             OffsetNumber offnum)
     662                 : {
     663 GIC   177440905 :     TupleDesc   itupdesc = RelationGetDescr(rel);
     664       177440905 :     BTPageOpaque opaque = BTPageGetOpaque(page);
     665 ECB             :     IndexTuple  itup;
     666                 :     ItemPointer heapTid;
     667                 :     ScanKey     scankey;
     668                 :     int         ncmpkey;
     669                 :     int         ntupatts;
     670                 :     int32       result;
     671                 : 
     672 GIC   177440905 :     Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
     673       177440905 :     Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
     674 CBC   177440905 :     Assert(key->heapkeyspace || key->scantid == NULL);
     675 ECB             : 
     676                 :     /*
     677                 :      * Force result ">" if target item is first data item on an internal page
     678                 :      * --- see NOTE above.
     679                 :      */
     680 GIC   177440905 :     if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
     681         1889977 :         return 1;
     682 ECB             : 
     683 CBC   175550928 :     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
     684 GIC   175550928 :     ntupatts = BTreeTupleGetNAtts(itup, rel);
     685 ECB             : 
     686                 :     /*
     687                 :      * The scan key is set up with the attribute number associated with each
     688                 :      * term in the key.  It is important that, if the index is multi-key, the
     689                 :      * scan contain the first k key attributes, and that they be in order.  If
     690                 :      * you think about how multi-key ordering works, you'll understand why
     691                 :      * this is.
     692                 :      *
     693                 :      * We don't test for violation of this condition here, however.  The
     694                 :      * initial setup for the index scan had better have gotten it right (see
     695                 :      * _bt_first).
     696                 :      */
     697                 : 
     698 GIC   175550928 :     ncmpkey = Min(ntupatts, key->keysz);
     699       175550928 :     Assert(key->heapkeyspace || ncmpkey == key->keysz);
     700 CBC   175550928 :     Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
     701       175550928 :     scankey = key->scankeys;
     702       220720244 :     for (int i = 1; i <= ncmpkey; i++)
     703 ECB             :     {
     704                 :         Datum       datum;
     705                 :         bool        isNull;
     706                 : 
     707 GIC   207864745 :         datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
     708                 : 
     709 CBC   207864745 :         if (scankey->sk_flags & SK_ISNULL)   /* key is NULL */
     710                 :         {
     711          170918 :             if (isNull)
     712 GIC          83 :                 result = 0;     /* NULL "=" NULL */
     713 CBC      170835 :             else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     714             132 :                 result = -1;    /* NULL "<" NOT_NULL */
     715 ECB             :             else
     716 CBC      170703 :                 result = 1;     /* NULL ">" NOT_NULL */
     717                 :         }
     718       207693827 :         else if (isNull)        /* key is NOT_NULL and item is NULL */
     719                 :         {
     720             132 :             if (scankey->sk_flags & SK_BT_NULLS_FIRST)
     721 UIC           0 :                 result = 1;     /* NOT_NULL ">" NULL */
     722 ECB             :             else
     723 GBC         132 :                 result = -1;    /* NOT_NULL "<" NULL */
     724                 :         }
     725 ECB             :         else
     726                 :         {
     727                 :             /*
     728                 :              * The sk_func needs to be passed the index value as left arg and
     729                 :              * the sk_argument as right arg (they might be of different
     730                 :              * types).  Since it is convenient for callers to think of
     731                 :              * _bt_compare as comparing the scankey to the index item, we have
     732                 :              * to flip the sign of the comparison result.  (Unless it's a DESC
     733                 :              * column, in which case we *don't* flip the sign.)
     734                 :              */
     735 GIC   207693695 :             result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
     736                 :                                                      scankey->sk_collation,
     737 ECB             :                                                      datum,
     738                 :                                                      scankey->sk_argument));
     739                 : 
     740 GIC   207693695 :             if (!(scankey->sk_flags & SK_BT_DESC))
     741       207693692 :                 INVERT_COMPARE_RESULT(result);
     742 ECB             :         }
     743                 : 
     744                 :         /* if the keys are unequal, return the difference */
     745 GIC   207864745 :         if (result != 0)
     746       162695429 :             return result;
     747 ECB             : 
     748 CBC    45169316 :         scankey++;
     749                 :     }
     750 ECB             : 
     751                 :     /*
     752                 :      * All non-truncated attributes (other than heap TID) were found to be
     753                 :      * equal.  Treat truncated attributes as minus infinity when scankey has a
     754                 :      * key attribute value that would otherwise be compared directly.
     755                 :      *
     756                 :      * Note: it doesn't matter if ntupatts includes non-key attributes;
     757                 :      * scankey won't, so explicitly excluding non-key attributes isn't
     758                 :      * necessary.
     759                 :      */
     760 GIC    12855499 :     if (key->keysz > ntupatts)
     761          125172 :         return 1;
     762 ECB             : 
     763                 :     /*
     764                 :      * Use the heap TID attribute and scantid to try to break the tie.  The
     765                 :      * rules are the same as any other key attribute -- only the
     766                 :      * representation differs.
     767                 :      */
     768 GIC    12730327 :     heapTid = BTreeTupleGetHeapTID(itup);
     769        12730327 :     if (key->scantid == NULL)
     770 ECB             :     {
     771                 :         /*
     772                 :          * Most searches have a scankey that is considered greater than a
     773                 :          * truncated pivot tuple if and when the scankey has equal values for
     774                 :          * attributes up to and including the least significant untruncated
     775                 :          * attribute in tuple.
     776                 :          *
     777                 :          * For example, if an index has the minimum two attributes (single
     778                 :          * user key attribute, plus heap TID attribute), and a page's high key
     779                 :          * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
     780                 :          * will not descend to the page to the left.  The search will descend
     781                 :          * right instead.  The truncated attribute in pivot tuple means that
     782                 :          * all non-pivot tuples on the page to the left are strictly < 'foo',
     783                 :          * so it isn't necessary to descend left.  In other words, search
     784                 :          * doesn't have to descend left because it isn't interested in a match
     785                 :          * that has a heap TID value of -inf.
     786                 :          *
     787                 :          * However, some searches (pivotsearch searches) actually require that
     788                 :          * we descend left when this happens.  -inf is treated as a possible
     789                 :          * match for omitted scankey attribute(s).  This is needed by page
     790                 :          * deletion, which must re-find leaf pages that are targets for
     791                 :          * deletion using their high keys.
     792                 :          *
     793                 :          * Note: the heap TID part of the test ensures that scankey is being
     794                 :          * compared to a pivot tuple with one or more truncated key
     795                 :          * attributes.
     796                 :          *
     797                 :          * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
     798                 :          * left here, since they have no heap TID attribute (and cannot have
     799                 :          * any -inf key values in any case, since truncation can only remove
     800                 :          * non-key attributes).  !heapkeyspace searches must always be
     801                 :          * prepared to deal with matches on both sides of the pivot once the
     802                 :          * leaf level is reached.
     803                 :          */
     804 GIC     9213987 :         if (key->heapkeyspace && !key->pivotsearch &&
     805         9207253 :             key->keysz == ntupatts && heapTid == NULL)
     806 CBC        7404 :             return 1;
     807 ECB             : 
     808                 :         /* All provided scankey arguments found to be equal */
     809 GIC     9206583 :         return 0;
     810                 :     }
     811 ECB             : 
     812                 :     /*
     813                 :      * Treat truncated heap TID as minus infinity, since scankey has a key
     814                 :      * attribute value (scantid) that would otherwise be compared directly
     815                 :      */
     816 GIC     3516340 :     Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
     817         3516340 :     if (heapTid == NULL)
     818 CBC        1978 :         return 1;
     819 ECB             : 
     820                 :     /*
     821                 :      * Scankey must be treated as equal to a posting list tuple if its scantid
     822                 :      * value falls within the range of the posting list.  In all other cases
     823                 :      * there can only be a single heap TID value, which is compared directly
     824                 :      * with scantid.
     825                 :      */
     826 GIC     3514362 :     Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
     827         3514362 :     result = ItemPointerCompare(key->scantid, heapTid);
     828 CBC     3513995 :     if (result <= 0 || !BTreeTupleIsPosting(itup))
     829         3394046 :         return result;
     830 ECB             :     else
     831                 :     {
     832 GIC      119949 :         result = ItemPointerCompare(key->scantid,
     833                 :                                     BTreeTupleGetMaxHeapTID(itup));
     834 CBC      119949 :         if (result > 0)
     835 GIC      112183 :             return 1;
     836 ECB             :     }
     837                 : 
     838 GIC        7766 :     return 0;
     839                 : }
     840 ECB             : 
     841                 : /*
     842                 :  *  _bt_first() -- Find the first item in a scan.
     843                 :  *
     844                 :  *      We need to be clever about the direction of scan, the search
     845                 :  *      conditions, and the tree ordering.  We find the first item (or,
     846                 :  *      if backwards scan, the last item) in the tree that satisfies the
     847                 :  *      qualifications in the scan key.  On success exit, the page containing
     848                 :  *      the current index tuple is pinned but not locked, and data about
     849                 :  *      the matching tuple(s) on the page has been loaded into so->currPos.
     850                 :  *      scan->xs_ctup.t_self is set to the heap TID of the current tuple,
     851                 :  *      and if requested, scan->xs_itup points to a copy of the index tuple.
     852                 :  *
     853                 :  * If there are no matching items in the index, we return false, with no
     854                 :  * pins or locks held.
     855                 :  *
     856                 :  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
     857                 :  * are both search-type scankeys (see nbtree/README for more about this).
     858                 :  * Within this routine, we build a temporary insertion-type scankey to use
     859                 :  * in locating the scan start position.
     860                 :  */
     861                 : bool
     862 GIC     9087979 : _bt_first(IndexScanDesc scan, ScanDirection dir)
     863                 : {
     864 CBC     9087979 :     Relation    rel = scan->indexRelation;
     865 GNC     9087979 :     Relation    heaprel = scan->heapRelation;
     866 GIC     9087979 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
     867 ECB             :     Buffer      buf;
     868                 :     BTStack     stack;
     869                 :     OffsetNumber offnum;
     870                 :     StrategyNumber strat;
     871                 :     bool        nextkey;
     872                 :     bool        goback;
     873                 :     BTScanInsertData inskey;
     874                 :     ScanKey     startKeys[INDEX_MAX_KEYS];
     875                 :     ScanKeyData notnullkeys[INDEX_MAX_KEYS];
     876 GIC     9087979 :     int         keysCount = 0;
     877                 :     int         i;
     878                 :     bool        status;
     879 ECB             :     StrategyNumber strat_total;
     880                 :     BTScanPosItem *currItem;
     881                 :     BlockNumber blkno;
     882                 : 
     883 GIC     9087979 :     Assert(!BTScanPosIsValid(so->currPos));
     884                 : 
     885         9087979 :     pgstat_count_index_scan(rel);
     886 ECB             : 
     887                 :     /*
     888                 :      * Examine the scan keys and eliminate any redundant keys; also mark the
     889                 :      * keys that must be matched to continue the scan.
     890                 :      */
     891 GIC     9087979 :     _bt_preprocess_keys(scan);
     892                 : 
     893                 :     /*
     894 ECB             :      * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     895                 :      * never be satisfied (eg, x == 1 AND x > 2).
     896                 :      */
     897 GIC     9087979 :     if (!so->qual_ok)
     898                 :     {
     899                 :         /* Notify any other workers that we're done with this scan key. */
     900 CBC        1286 :         _bt_parallel_done(scan);
     901 GIC        1286 :         return false;
     902                 :     }
     903 ECB             : 
     904                 :     /*
     905                 :      * For parallel scans, get the starting page from shared state. If the
     906                 :      * scan has not started, proceed to find out first leaf page in the usual
     907                 :      * way while keeping other participating processes waiting.  If the scan
     908                 :      * has already begun, use the page number from the shared structure.
     909                 :      */
     910 GIC     9086693 :     if (scan->parallel_scan != NULL)
     911                 :     {
     912             184 :         status = _bt_parallel_seize(scan, &blkno);
     913 CBC         184 :         if (!status)
     914 GIC         140 :             return false;
     915 CBC          44 :         else if (blkno == P_NONE)
     916 ECB             :         {
     917 CBC           2 :             _bt_parallel_done(scan);
     918               2 :             return false;
     919                 :         }
     920              42 :         else if (blkno != InvalidBlockNumber)
     921 ECB             :         {
     922 GIC           4 :             if (!_bt_parallel_readpage(scan, blkno, dir))
     923 LBC           0 :                 return false;
     924 GIC           4 :             goto readcomplete;
     925 ECB             :         }
     926 EUB             :     }
     927 ECB             : 
     928                 :     /*----------
     929                 :      * Examine the scan keys to discover where we need to start the scan.
     930                 :      *
     931                 :      * We want to identify the keys that can be used as starting boundaries;
     932                 :      * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     933                 :      * a backwards scan.  We can use keys for multiple attributes so long as
     934                 :      * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     935                 :      * a > or < boundary or find an attribute with no boundary (which can be
     936                 :      * thought of as the same as "> -infinity"), we can't use keys for any
     937                 :      * attributes to its right, because it would break our simplistic notion
     938                 :      * of what initial positioning strategy to use.
     939                 :      *
     940                 :      * When the scan keys include cross-type operators, _bt_preprocess_keys
     941                 :      * may not be able to eliminate redundant keys; in such cases we will
     942                 :      * arbitrarily pick a usable one for each attribute.  This is correct
     943                 :      * but possibly not optimal behavior.  (For example, with keys like
     944                 :      * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     945                 :      * x=5 would be more efficient.)  Since the situation only arises given
     946                 :      * a poorly-worded query plus an incomplete opfamily, live with it.
     947                 :      *
     948                 :      * When both equality and inequality keys appear for a single attribute
     949                 :      * (again, only possible when cross-type operators appear), we *must*
     950                 :      * select one of the equality keys for the starting point, because
     951                 :      * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     952                 :      * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     953                 :      * start at x=4, we will fail and stop before reaching x=10.  If multiple
     954                 :      * equality quals survive preprocessing, however, it doesn't matter which
     955                 :      * one we use --- by definition, they are either redundant or
     956                 :      * contradictory.
     957                 :      *
     958                 :      * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     959                 :      * If the index stores nulls at the end of the index we'll be starting
     960                 :      * from, and we have no boundary key for the column (which means the key
     961                 :      * we deduced NOT NULL from is an inequality key that constrains the other
     962                 :      * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     963                 :      * use as a boundary key.  If we didn't do this, we might find ourselves
     964                 :      * traversing a lot of null entries at the start of the scan.
     965                 :      *
     966                 :      * In this loop, row-comparison keys are treated the same as keys on their
     967                 :      * first (leftmost) columns.  We'll add on lower-order columns of the row
     968                 :      * comparison below, if possible.
     969                 :      *
     970                 :      * The selected scan keys (at most one per index column) are remembered by
     971                 :      * storing their addresses into the local startKeys[] array.
     972                 :      *----------
     973                 :      */
     974 GIC     9086547 :     strat_total = BTEqualStrategyNumber;
     975         9086547 :     if (so->numberOfKeys > 0)
     976                 :     {
     977 ECB             :         AttrNumber  curattr;
     978                 :         ScanKey     chosen;
     979                 :         ScanKey     impliesNN;
     980                 :         ScanKey     cur;
     981                 : 
     982                 :         /*
     983                 :          * chosen is the so-far-chosen key for the current attribute, if any.
     984                 :          * We don't cast the decision in stone until we reach keys for the
     985                 :          * next attribute.
     986                 :          */
     987 GIC     9081689 :         curattr = 1;
     988         9081689 :         chosen = NULL;
     989                 :         /* Also remember any scankey that implies a NOT NULL constraint */
     990 CBC     9081689 :         impliesNN = NULL;
     991 ECB             : 
     992                 :         /*
     993                 :          * Loop iterates from 0 to numberOfKeys inclusive; we use the last
     994                 :          * pass to handle after-last-key processing.  Actual exit from the
     995                 :          * loop is at one of the "break" statements below.
     996                 :          */
     997 GIC    24989949 :         for (cur = so->keyData, i = 0;; cur++, i++)
     998                 :         {
     999        24989949 :             if (i >= so->numberOfKeys || cur->sk_attno != curattr)
    1000 ECB             :             {
    1001                 :                 /*
    1002                 :                  * Done looking at keys for curattr.  If we didn't find a
    1003                 :                  * usable boundary key, see if we can deduce a NOT NULL key.
    1004                 :                  */
    1005 GIC    15934530 :                 if (chosen == NULL && impliesNN != NULL &&
    1006           25904 :                     ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1007                 :                      ScanDirectionIsForward(dir) :
    1008 ECB             :                      ScanDirectionIsBackward(dir)))
    1009                 :                 {
    1010                 :                     /* Yes, so build the key in notnullkeys[keysCount] */
    1011 GIC           3 :                     chosen = &notnullkeys[keysCount];
    1012               3 :                     ScanKeyEntryInitialize(chosen,
    1013                 :                                            (SK_SEARCHNOTNULL | SK_ISNULL |
    1014 CBC           3 :                                             (impliesNN->sk_flags &
    1015 ECB             :                                              (SK_BT_DESC | SK_BT_NULLS_FIRST))),
    1016                 :                                            curattr,
    1017 CBC           3 :                                            ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
    1018                 :                                             BTGreaterStrategyNumber :
    1019                 :                                             BTLessStrategyNumber),
    1020 ECB             :                                            InvalidOid,
    1021                 :                                            InvalidOid,
    1022                 :                                            InvalidOid,
    1023                 :                                            (Datum) 0);
    1024                 :                 }
    1025                 : 
    1026                 :                 /*
    1027                 :                  * If we still didn't find a usable boundary key, quit; else
    1028                 :                  * save the boundary key pointer in startKeys.
    1029                 :                  */
    1030 GIC    15908626 :                 if (chosen == NULL)
    1031           27175 :                     break;
    1032        15881451 :                 startKeys[keysCount++] = chosen;
    1033 ECB             : 
    1034                 :                 /*
    1035                 :                  * Adjust strat_total, and quit if we have stored a > or <
    1036                 :                  * key.
    1037                 :                  */
    1038 GIC    15881451 :                 strat = chosen->sk_strategy;
    1039        15881451 :                 if (strat != BTEqualStrategyNumber)
    1040                 :                 {
    1041 CBC      709058 :                     strat_total = strat;
    1042          709058 :                     if (strat == BTGreaterStrategyNumber ||
    1043                 :                         strat == BTLessStrategyNumber)
    1044 ECB             :                         break;
    1045                 :                 }
    1046                 : 
    1047                 :                 /*
    1048                 :                  * Done if that was the last attribute, or if next key is not
    1049                 :                  * in sequence (implying no boundary key is available for the
    1050                 :                  * next attribute).
    1051                 :                  */
    1052 GIC    15174487 :                 if (i >= so->numberOfKeys ||
    1053         6827667 :                     cur->sk_attno != curattr + 1)
    1054                 :                     break;
    1055 ECB             : 
    1056                 :                 /*
    1057                 :                  * Reset for next attr.
    1058                 :                  */
    1059 GIC     6826937 :                 curattr = cur->sk_attno;
    1060         6826937 :                 chosen = NULL;
    1061         6826937 :                 impliesNN = NULL;
    1062 ECB             :             }
    1063                 : 
    1064                 :             /*
    1065                 :              * Can we use this key as a starting boundary for this attr?
    1066                 :              *
    1067                 :              * If not, does it imply a NOT NULL constraint?  (Because
    1068                 :              * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
    1069                 :              * *any* inequality key works for that; we need not test.)
    1070                 :              */
    1071 GIC    15908260 :             switch (cur->sk_strategy)
    1072                 :             {
    1073           48301 :                 case BTLessStrategyNumber:
    1074 ECB             :                 case BTLessEqualStrategyNumber:
    1075 GIC       48301 :                     if (chosen == NULL)
    1076 ECB             :                     {
    1077 GIC       47399 :                         if (ScanDirectionIsBackward(dir))
    1078 CBC       21504 :                             chosen = cur;
    1079                 :                         else
    1080           25895 :                             impliesNN = cur;
    1081 ECB             :                     }
    1082 GIC       48301 :                     break;
    1083 CBC    15172393 :                 case BTEqualStrategyNumber:
    1084                 :                     /* override any non-equality choice */
    1085        15172393 :                     chosen = cur;
    1086        15172393 :                     break;
    1087 GIC      687566 :                 case BTGreaterEqualStrategyNumber:
    1088 ECB             :                 case BTGreaterStrategyNumber:
    1089 CBC      687566 :                     if (chosen == NULL)
    1090 ECB             :                     {
    1091 GIC      687566 :                         if (ScanDirectionIsForward(dir))
    1092 CBC      687551 :                             chosen = cur;
    1093                 :                         else
    1094              15 :                             impliesNN = cur;
    1095 ECB             :                     }
    1096 GIC      687566 :                     break;
    1097 ECB             :             }
    1098                 :         }
    1099                 :     }
    1100                 : 
    1101                 :     /*
    1102                 :      * If we found no usable boundary keys, we have to start from one end of
    1103                 :      * the tree.  Walk down that edge to the first or last key, and scan from
    1104                 :      * there.
    1105                 :      */
    1106 GIC     9086547 :     if (keysCount == 0)
    1107                 :     {
    1108                 :         bool        match;
    1109 ECB             : 
    1110 GIC       31957 :         match = _bt_endpoint(scan, dir);
    1111                 : 
    1112           31957 :         if (!match)
    1113 ECB             :         {
    1114                 :             /* No match, so mark (parallel) scan finished */
    1115 CBC        3978 :             _bt_parallel_done(scan);
    1116                 :         }
    1117                 : 
    1118           31957 :         return match;
    1119                 :     }
    1120                 : 
    1121 ECB             :     /*
    1122                 :      * We want to start the scan somewhere within the index.  Set up an
    1123                 :      * insertion scankey we can use to search for the boundary point we
    1124                 :      * identified above.  The insertion scankey is built using the keys
    1125                 :      * identified by startKeys[].  (Remaining insertion scankey fields are
    1126                 :      * initialized after initial-positioning strategy is finalized.)
    1127                 :      */
    1128 GIC     9054590 :     Assert(keysCount <= INDEX_MAX_KEYS);
    1129        24936029 :     for (i = 0; i < keysCount; i++)
    1130                 :     {
    1131 CBC    15881451 :         ScanKey     cur = startKeys[i];
    1132 ECB             : 
    1133 GIC    15881451 :         Assert(cur->sk_attno == i + 1);
    1134 ECB             : 
    1135 GIC    15881451 :         if (cur->sk_flags & SK_ROW_HEADER)
    1136 ECB             :         {
    1137                 :             /*
    1138                 :              * Row comparison header: look to the first row member instead.
    1139                 :              *
    1140                 :              * The member scankeys are already in insertion format (ie, they
    1141                 :              * have sk_func = 3-way-comparison function), but we have to watch
    1142                 :              * out for nulls, which _bt_preprocess_keys didn't check. A null
    1143                 :              * in the first row member makes the condition unmatchable, just
    1144                 :              * like qual_ok = false.
    1145                 :              */
    1146 GIC          12 :             ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
    1147                 : 
    1148              12 :             Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1149 CBC          12 :             if (subkey->sk_flags & SK_ISNULL)
    1150                 :             {
    1151 LBC           0 :                 _bt_parallel_done(scan);
    1152               0 :                 return false;
    1153                 :             }
    1154 GBC          12 :             memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
    1155 EUB             : 
    1156                 :             /*
    1157 ECB             :              * If the row comparison is the last positioning key we accepted,
    1158                 :              * try to add additional keys from the lower-order row members.
    1159                 :              * (If we accepted independent conditions on additional index
    1160                 :              * columns, we use those instead --- doesn't seem worth trying to
    1161                 :              * determine which is more restrictive.)  Note that this is OK
    1162                 :              * even if the row comparison is of ">" or "<" type, because the
    1163                 :              * condition applied to all but the last row member is effectively
    1164                 :              * ">=" or "<=", and so the extra keys don't break the positioning
    1165                 :              * scheme.  But, by the same token, if we aren't able to use all
    1166                 :              * the row members, then the part of the row comparison that we
    1167                 :              * did use has to be treated as just a ">=" or "<=" condition, and
    1168                 :              * so we'd better adjust strat_total accordingly.
    1169                 :              */
    1170 GIC          12 :             if (i == keysCount - 1)
    1171                 :             {
    1172              12 :                 bool        used_all_subkeys = false;
    1173 ECB             : 
    1174 GIC          12 :                 Assert(!(subkey->sk_flags & SK_ROW_END));
    1175 ECB             :                 for (;;)
    1176                 :                 {
    1177 CBC          12 :                     subkey++;
    1178 GIC          12 :                     Assert(subkey->sk_flags & SK_ROW_MEMBER);
    1179              12 :                     if (subkey->sk_attno != keysCount + 1)
    1180 LBC           0 :                         break;  /* out-of-sequence, can't use it */
    1181 CBC          12 :                     if (subkey->sk_strategy != cur->sk_strategy)
    1182 LBC           0 :                         break;  /* wrong direction, can't use it */
    1183 GBC          12 :                     if (subkey->sk_flags & SK_ISNULL)
    1184 LBC           0 :                         break;  /* can't use null keys */
    1185 GBC          12 :                     Assert(keysCount < INDEX_MAX_KEYS);
    1186 CBC          12 :                     memcpy(inskey.scankeys + keysCount, subkey,
    1187 EUB             :                            sizeof(ScanKeyData));
    1188 CBC          12 :                     keysCount++;
    1189              12 :                     if (subkey->sk_flags & SK_ROW_END)
    1190                 :                     {
    1191              12 :                         used_all_subkeys = true;
    1192              12 :                         break;
    1193                 :                     }
    1194 ECB             :                 }
    1195 CBC          12 :                 if (!used_all_subkeys)
    1196                 :                 {
    1197 UIC           0 :                     switch (strat_total)
    1198 ECB             :                     {
    1199 UIC           0 :                         case BTLessStrategyNumber:
    1200 UBC           0 :                             strat_total = BTLessEqualStrategyNumber;
    1201 UIC           0 :                             break;
    1202 UBC           0 :                         case BTGreaterStrategyNumber:
    1203               0 :                             strat_total = BTGreaterEqualStrategyNumber;
    1204               0 :                             break;
    1205 EUB             :                     }
    1206                 :                 }
    1207 GBC          12 :                 break;          /* done with outer loop */
    1208                 :             }
    1209                 :         }
    1210 ECB             :         else
    1211                 :         {
    1212                 :             /*
    1213                 :              * Ordinary comparison key.  Transform the search-style scan key
    1214                 :              * to an insertion scan key by replacing the sk_func with the
    1215                 :              * appropriate btree comparison function.
    1216                 :              *
    1217                 :              * If scankey operator is not a cross-type comparison, we can use
    1218                 :              * the cached comparison function; otherwise gotta look it up in
    1219                 :              * the catalogs.  (That can't lead to infinite recursion, since no
    1220                 :              * indexscan initiated by syscache lookup will use cross-data-type
    1221                 :              * operators.)
    1222                 :              *
    1223                 :              * We support the convention that sk_subtype == InvalidOid means
    1224                 :              * the opclass input type; this is a hack to simplify life for
    1225                 :              * ScanKeyInit().
    1226                 :              */
    1227 GIC    15881439 :             if (cur->sk_subtype == rel->rd_opcintype[i] ||
    1228        15621462 :                 cur->sk_subtype == InvalidOid)
    1229        15875108 :             {
    1230 ECB             :                 FmgrInfo   *procinfo;
    1231                 : 
    1232 CBC    15875108 :                 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
    1233 GIC    15875108 :                 ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
    1234                 :                                                cur->sk_flags,
    1235 CBC    15875108 :                                                cur->sk_attno,
    1236 ECB             :                                                InvalidStrategy,
    1237                 :                                                cur->sk_subtype,
    1238                 :                                                cur->sk_collation,
    1239                 :                                                procinfo,
    1240                 :                                                cur->sk_argument);
    1241                 :             }
    1242                 :             else
    1243                 :             {
    1244                 :                 RegProcedure cmp_proc;
    1245                 : 
    1246 GIC        6331 :                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
    1247            6331 :                                              rel->rd_opcintype[i],
    1248                 :                                              cur->sk_subtype,
    1249 ECB             :                                              BTORDER_PROC);
    1250 CBC        6331 :                 if (!RegProcedureIsValid(cmp_proc))
    1251 UIC           0 :                     elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
    1252                 :                          BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
    1253 ECB             :                          cur->sk_attno, RelationGetRelationName(rel));
    1254 GBC        6331 :                 ScanKeyEntryInitialize(inskey.scankeys + i,
    1255                 :                                        cur->sk_flags,
    1256 GIC        6331 :                                        cur->sk_attno,
    1257 ECB             :                                        InvalidStrategy,
    1258                 :                                        cur->sk_subtype,
    1259                 :                                        cur->sk_collation,
    1260                 :                                        cmp_proc,
    1261                 :                                        cur->sk_argument);
    1262                 :             }
    1263                 :         }
    1264                 :     }
    1265                 : 
    1266                 :     /*----------
    1267                 :      * Examine the selected initial-positioning strategy to determine exactly
    1268                 :      * where we need to start the scan, and set flag variables to control the
    1269                 :      * code below.
    1270                 :      *
    1271                 :      * If nextkey = false, _bt_search and _bt_binsrch will locate the first
    1272                 :      * item >= scan key.  If nextkey = true, they will locate the first
    1273                 :      * item > scan key.
    1274                 :      *
    1275                 :      * If goback = true, we will then step back one item, while if
    1276                 :      * goback = false, we will start the scan on the located item.
    1277                 :      *----------
    1278                 :      */
    1279 GIC     9054590 :     switch (strat_total)
    1280                 :     {
    1281           21507 :         case BTLessStrategyNumber:
    1282 ECB             : 
    1283                 :             /*
    1284                 :              * Find first item >= scankey, then back up one to arrive at last
    1285                 :              * item < scankey.  (Note: this positioning strategy is only used
    1286                 :              * for a backward scan, so that is always the correct starting
    1287                 :              * position.)
    1288                 :              */
    1289 GIC       21507 :             nextkey = false;
    1290           21507 :             goback = true;
    1291           21507 :             break;
    1292 ECB             : 
    1293 LBC           0 :         case BTLessEqualStrategyNumber:
    1294 ECB             : 
    1295                 :             /*
    1296 EUB             :              * Find first item > scankey, then back up one to arrive at last
    1297                 :              * item <= scankey.  (Note: this positioning strategy is only used
    1298                 :              * for a backward scan, so that is always the correct starting
    1299                 :              * position.)
    1300                 :              */
    1301 UIC           0 :             nextkey = true;
    1302               0 :             goback = true;
    1303               0 :             break;
    1304 EUB             : 
    1305 GBC     8345532 :         case BTEqualStrategyNumber:
    1306 EUB             : 
    1307                 :             /*
    1308 ECB             :              * If a backward scan was specified, need to start with last equal
    1309                 :              * item not first one.
    1310                 :              */
    1311 GIC     8345532 :             if (ScanDirectionIsBackward(dir))
    1312                 :             {
    1313                 :                 /*
    1314 ECB             :                  * This is the same as the <= strategy.  We will check at the
    1315                 :                  * end whether the found item is actually =.
    1316                 :                  */
    1317 GIC          64 :                 nextkey = true;
    1318              64 :                 goback = true;
    1319                 :             }
    1320 ECB             :             else
    1321                 :             {
    1322                 :                 /*
    1323                 :                  * This is the same as the >= strategy.  We will check at the
    1324                 :                  * end whether the found item is actually =.
    1325                 :                  */
    1326 GIC     8345468 :                 nextkey = false;
    1327         8345468 :                 goback = false;
    1328                 :             }
    1329 CBC     8345532 :             break;
    1330 ECB             : 
    1331 GIC        2094 :         case BTGreaterEqualStrategyNumber:
    1332 ECB             : 
    1333                 :             /*
    1334                 :              * Find first item >= scankey.  (This is only used for forward
    1335                 :              * scans.)
    1336                 :              */
    1337 GIC        2094 :             nextkey = false;
    1338            2094 :             goback = false;
    1339            2094 :             break;
    1340 ECB             : 
    1341 CBC      685457 :         case BTGreaterStrategyNumber:
    1342 ECB             : 
    1343                 :             /*
    1344                 :              * Find first item > scankey.  (This is only used for forward
    1345                 :              * scans.)
    1346                 :              */
    1347 GIC      685457 :             nextkey = true;
    1348          685457 :             goback = false;
    1349          685457 :             break;
    1350 ECB             : 
    1351 LBC           0 :         default:
    1352 ECB             :             /* can't get here, but keep compiler quiet */
    1353 UIC           0 :             elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
    1354 EUB             :             return false;
    1355                 :     }
    1356                 : 
    1357                 :     /* Initialize remaining insertion scan key fields */
    1358 GNC     9054590 :     _bt_metaversion(rel, heaprel, &inskey.heapkeyspace, &inskey.allequalimage);
    1359 GIC     9054590 :     inskey.anynullkeys = false; /* unused */
    1360         9054590 :     inskey.nextkey = nextkey;
    1361 CBC     9054590 :     inskey.pivotsearch = false;
    1362         9054590 :     inskey.scantid = NULL;
    1363         9054590 :     inskey.keysz = keysCount;
    1364 ECB             : 
    1365                 :     /*
    1366                 :      * Use the manufactured insertion scan key to descend the tree and
    1367                 :      * position ourselves on the target leaf page.
    1368                 :      */
    1369 GNC     9054590 :     stack = _bt_search(rel, heaprel, &inskey, &buf, BT_READ, scan->xs_snapshot);
    1370                 : 
    1371                 :     /* don't need to keep the stack around... */
    1372 CBC     9054590 :     _bt_freestack(stack);
    1373                 : 
    1374 GIC     9054590 :     if (!BufferIsValid(buf))
    1375 ECB             :     {
    1376                 :         /*
    1377                 :          * We only get here if the index is completely empty. Lock relation
    1378                 :          * because nothing finer to lock exists.
    1379                 :          */
    1380 GIC      385759 :         PredicateLockRelation(rel, scan->xs_snapshot);
    1381                 : 
    1382                 :         /*
    1383 ECB             :          * mark parallel scan as done, so that all the workers can finish
    1384                 :          * their scan
    1385                 :          */
    1386 GIC      385759 :         _bt_parallel_done(scan);
    1387          385759 :         BTScanPosInvalidate(so->currPos);
    1388                 : 
    1389 CBC      385759 :         return false;
    1390 ECB             :     }
    1391                 :     else
    1392 CBC     8668831 :         PredicateLockPage(rel, BufferGetBlockNumber(buf),
    1393                 :                           scan->xs_snapshot);
    1394                 : 
    1395         8668831 :     _bt_initialize_more_data(so, dir);
    1396                 : 
    1397                 :     /* position to the precise item on the page */
    1398         8668831 :     offnum = _bt_binsrch(rel, &inskey, buf);
    1399                 : 
    1400                 :     /*
    1401 ECB             :      * If nextkey = false, we are positioned at the first item >= scan key, or
    1402                 :      * possibly at the end of a page on which all the existing items are less
    1403                 :      * than the scan key and we know that everything on later pages is greater
    1404                 :      * than or equal to scan key.
    1405                 :      *
    1406                 :      * If nextkey = true, we are positioned at the first item > scan key, or
    1407                 :      * possibly at the end of a page on which all the existing items are less
    1408                 :      * than or equal to the scan key and we know that everything on later
    1409                 :      * pages is greater than scan key.
    1410                 :      *
    1411                 :      * The actually desired starting point is either this item or the prior
    1412                 :      * one, or in the end-of-page case it's the first item on the next page or
    1413                 :      * the last item on this page.  Adjust the starting offset if needed. (If
    1414                 :      * this results in an offset before the first item or after the last one,
    1415                 :      * _bt_readpage will report no items found, and then we'll step to the
    1416                 :      * next page as needed.)
    1417                 :      */
    1418 GIC     8668831 :     if (goback)
    1419           21567 :         offnum = OffsetNumberPrev(offnum);
    1420                 : 
    1421 ECB             :     /* remember which buffer we have pinned, if any */
    1422 CBC     8668831 :     Assert(!BTScanPosIsValid(so->currPos));
    1423 GIC     8668831 :     so->currPos.buf = buf;
    1424                 : 
    1425 ECB             :     /*
    1426                 :      * Now load data from the first page of the scan.
    1427                 :      */
    1428 GIC     8668831 :     if (!_bt_readpage(scan, dir, offnum))
    1429                 :     {
    1430                 :         /*
    1431 ECB             :          * There's no actually-matching data on this page.  Try to advance to
    1432                 :          * the next page.  Return false if there's no matching data at all.
    1433                 :          */
    1434 GIC     3688427 :         _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
    1435         3688427 :         if (!_bt_steppage(scan, dir))
    1436         3688414 :             return false;
    1437 ECB             :     }
    1438                 :     else
    1439                 :     {
    1440                 :         /* Drop the lock, and maybe the pin, on the current page */
    1441 GIC     4980404 :         _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    1442                 :     }
    1443                 : 
    1444 CBC     4980421 : readcomplete:
    1445                 :     /* OK, itemIndex says what to return */
    1446 GIC     4980421 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    1447 CBC     4980421 :     scan->xs_heaptid = currItem->heapTid;
    1448 GIC     4980421 :     if (scan->xs_want_itup)
    1449 CBC       64578 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1450 ECB             : 
    1451 CBC     4980421 :     return true;
    1452 ECB             : }
    1453                 : 
    1454                 : /*
    1455                 :  *  _bt_next() -- Get the next item in a scan.
    1456                 :  *
    1457                 :  *      On entry, so->currPos describes the current page, which may be pinned
    1458                 :  *      but is not locked, and so->currPos.itemIndex identifies which item was
    1459                 :  *      previously returned.
    1460                 :  *
    1461                 :  *      On successful exit, scan->xs_ctup.t_self is set to the TID of the
    1462                 :  *      next heap tuple, and if requested, scan->xs_itup points to a copy of
    1463                 :  *      the index tuple.  so->currPos is updated as needed.
    1464                 :  *
    1465                 :  *      On failure exit (no more tuples), we release pin and set
    1466                 :  *      so->currPos.buf to InvalidBuffer.
    1467                 :  */
    1468                 : bool
    1469 GIC    11957219 : _bt_next(IndexScanDesc scan, ScanDirection dir)
    1470                 : {
    1471        11957219 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1472 ECB             :     BTScanPosItem *currItem;
    1473                 : 
    1474                 :     /*
    1475                 :      * Advance to next tuple on current page; or if there's no more, try to
    1476                 :      * step to the next page with data.
    1477                 :      */
    1478 GIC    11957219 :     if (ScanDirectionIsForward(dir))
    1479                 :     {
    1480        11948015 :         if (++so->currPos.itemIndex > so->currPos.lastItem)
    1481 ECB             :         {
    1482 GIC     1161767 :             if (!_bt_steppage(scan, dir))
    1483 CBC     1147466 :                 return false;
    1484                 :         }
    1485 ECB             :     }
    1486                 :     else
    1487                 :     {
    1488 GIC        9204 :         if (--so->currPos.itemIndex < so->currPos.firstItem)
    1489                 :         {
    1490              37 :             if (!_bt_steppage(scan, dir))
    1491 CBC          31 :                 return false;
    1492                 :         }
    1493 ECB             :     }
    1494                 : 
    1495                 :     /* OK, itemIndex says what to return */
    1496 GIC    10809722 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    1497        10809722 :     scan->xs_heaptid = currItem->heapTid;
    1498        10809722 :     if (scan->xs_want_itup)
    1499 CBC     1788657 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    1500 ECB             : 
    1501 CBC    10809722 :     return true;
    1502 ECB             : }
    1503                 : 
    1504                 : /*
    1505                 :  *  _bt_readpage() -- Load data from current index page into so->currPos
    1506                 :  *
    1507                 :  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
    1508                 :  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
    1509                 :  * they are updated as appropriate.  All other fields of so->currPos are
    1510                 :  * initialized from scratch here.
    1511                 :  *
    1512                 :  * We scan the current page starting at offnum and moving in the indicated
    1513                 :  * direction.  All items matching the scan keys are loaded into currPos.items.
    1514                 :  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
    1515                 :  * that there can be no more matching tuples in the current scan direction.
    1516                 :  *
    1517                 :  * In the case of a parallel scan, caller must have called _bt_parallel_seize
    1518                 :  * prior to calling this function; this function will invoke
    1519                 :  * _bt_parallel_release before returning.
    1520                 :  *
    1521                 :  * Returns true if any matching items found on the page, false if none.
    1522                 :  */
    1523                 : static bool
    1524 GIC     8712708 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
    1525                 : {
    1526         8712708 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1527 ECB             :     Page        page;
    1528                 :     BTPageOpaque opaque;
    1529                 :     OffsetNumber minoff;
    1530                 :     OffsetNumber maxoff;
    1531                 :     int         itemIndex;
    1532                 :     bool        continuescan;
    1533                 :     int         indnatts;
    1534                 : 
    1535                 :     /*
    1536                 :      * We must have the buffer pinned and locked, but the usual macro can't be
    1537                 :      * used here; this function is what makes it good for currPos.
    1538                 :      */
    1539 GIC     8712708 :     Assert(BufferIsValid(so->currPos.buf));
    1540                 : 
    1541         8712708 :     page = BufferGetPage(so->currPos.buf);
    1542 CBC     8712708 :     opaque = BTPageGetOpaque(page);
    1543                 : 
    1544 ECB             :     /* allow next page be processed by parallel worker */
    1545 CBC     8712708 :     if (scan->parallel_scan)
    1546                 :     {
    1547 GIC         644 :         if (ScanDirectionIsForward(dir))
    1548 CBC         644 :             _bt_parallel_release(scan, opaque->btpo_next);
    1549                 :         else
    1550 LBC           0 :             _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
    1551 ECB             :     }
    1552                 : 
    1553 GBC     8712708 :     continuescan = true;        /* default assumption */
    1554 GIC     8712708 :     indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
    1555         8712708 :     minoff = P_FIRSTDATAKEY(opaque);
    1556 CBC     8712708 :     maxoff = PageGetMaxOffsetNumber(page);
    1557 ECB             : 
    1558                 :     /*
    1559                 :      * We note the buffer's block number so that we can release the pin later.
    1560                 :      * This allows us to re-read the buffer if it is needed again for hinting.
    1561                 :      */
    1562 GIC     8712708 :     so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
    1563                 : 
    1564                 :     /*
    1565 ECB             :      * We save the LSN of the page as we read it, so that we know whether it
    1566                 :      * safe to apply LP_DEAD hints to the page later.  This allows us to drop
    1567                 :      * the pin for MVCC scans, which allows vacuum to avoid blocking.
    1568                 :      */
    1569 GIC     8712708 :     so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
    1570                 : 
    1571                 :     /*
    1572 ECB             :      * we must save the page's right-link while scanning it; this tells us
    1573                 :      * where to step right to after we're done with these items.  There is no
    1574                 :      * corresponding need for the left-link, since splits always go right.
    1575                 :      */
    1576 GIC     8712708 :     so->currPos.nextPage = opaque->btpo_next;
    1577                 : 
    1578                 :     /* initialize tuple workspace to empty */
    1579 CBC     8712708 :     so->currPos.nextTupleOffset = 0;
    1580                 : 
    1581                 :     /*
    1582 ECB             :      * Now that the current page has been made consistent, the macro should be
    1583                 :      * good.
    1584                 :      */
    1585 GIC     8712708 :     Assert(BTScanPosIsPinned(so->currPos));
    1586                 : 
    1587         8712708 :     if (ScanDirectionIsForward(dir))
    1588 ECB             :     {
    1589                 :         /* load items[] in ascending order */
    1590 CBC     8691103 :         itemIndex = 0;
    1591                 : 
    1592 GIC     8691103 :         offnum = Max(offnum, minoff);
    1593 ECB             : 
    1594 GIC    30578310 :         while (offnum <= maxoff)
    1595 ECB             :         {
    1596 GIC    27999519 :             ItemId      iid = PageGetItemId(page, offnum);
    1597 ECB             :             IndexTuple  itup;
    1598                 : 
    1599                 :             /*
    1600                 :              * If the scan specifies not to return killed tuples, then we
    1601                 :              * treat a killed tuple as not passing the qual
    1602                 :              */
    1603 GIC    27999519 :             if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    1604                 :             {
    1605         1157894 :                 offnum = OffsetNumberNext(offnum);
    1606 CBC     1157894 :                 continue;
    1607                 :             }
    1608 ECB             : 
    1609 CBC    26841625 :             itup = (IndexTuple) PageGetItem(page, iid);
    1610                 : 
    1611 GIC    26841625 :             if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
    1612 ECB             :             {
    1613                 :                 /* tuple passes all scan key conditions */
    1614 CBC    20556895 :                 if (!BTreeTupleIsPosting(itup))
    1615                 :                 {
    1616                 :                     /* Remember it */
    1617        20272487 :                     _bt_saveitem(so, itemIndex, offnum, itup);
    1618 GIC    20272487 :                     itemIndex++;
    1619                 :                 }
    1620 ECB             :                 else
    1621                 :                 {
    1622                 :                     int         tupleOffset;
    1623                 : 
    1624                 :                     /*
    1625                 :                      * Set up state to return posting list, and remember first
    1626                 :                      * TID
    1627                 :                      */
    1628                 :                     tupleOffset =
    1629 GIC      284408 :                         _bt_setuppostingitems(so, itemIndex, offnum,
    1630                 :                                               BTreeTupleGetPostingN(itup, 0),
    1631                 :                                               itup);
    1632 CBC      284408 :                     itemIndex++;
    1633                 :                     /* Remember additional TIDs */
    1634 GIC     1444153 :                     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
    1635 ECB             :                     {
    1636 GIC     1159745 :                         _bt_savepostingitem(so, itemIndex, offnum,
    1637 ECB             :                                             BTreeTupleGetPostingN(itup, i),
    1638                 :                                             tupleOffset);
    1639 CBC     1159745 :                         itemIndex++;
    1640                 :                     }
    1641                 :                 }
    1642 ECB             :             }
    1643                 :             /* When !continuescan, there can't be any more matches, so stop */
    1644 GIC    26841625 :             if (!continuescan)
    1645         6112312 :                 break;
    1646                 : 
    1647 CBC    20729313 :             offnum = OffsetNumberNext(offnum);
    1648 ECB             :         }
    1649                 : 
    1650                 :         /*
    1651                 :          * We don't need to visit page to the right when the high key
    1652                 :          * indicates that no more matches will be found there.
    1653                 :          *
    1654                 :          * Checking the high key like this works out more often than you might
    1655                 :          * think.  Leaf page splits pick a split point between the two most
    1656                 :          * dissimilar tuples (this is weighed against the need to evenly share
    1657                 :          * free space).  Leaf pages with high key attribute values that can
    1658                 :          * only appear on non-pivot tuples on the right sibling page are
    1659                 :          * common.
    1660                 :          */
    1661 GIC     8691103 :         if (continuescan && !P_RIGHTMOST(opaque))
    1662                 :         {
    1663           62692 :             ItemId      iid = PageGetItemId(page, P_HIKEY);
    1664 CBC       62692 :             IndexTuple  itup = (IndexTuple) PageGetItem(page, iid);
    1665                 :             int         truncatt;
    1666 ECB             : 
    1667 CBC       62692 :             truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
    1668 GIC       62692 :             _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
    1669                 :         }
    1670 ECB             : 
    1671 CBC     8691103 :         if (!continuescan)
    1672 GIC     6147219 :             so->currPos.moreRight = false;
    1673                 : 
    1674 CBC     8691103 :         Assert(itemIndex <= MaxTIDsPerBTreePage);
    1675         8691103 :         so->currPos.firstItem = 0;
    1676 GIC     8691103 :         so->currPos.lastItem = itemIndex - 1;
    1677 CBC     8691103 :         so->currPos.itemIndex = 0;
    1678 ECB             :     }
    1679                 :     else
    1680                 :     {
    1681                 :         /* load items[] in descending order */
    1682 GIC       21605 :         itemIndex = MaxTIDsPerBTreePage;
    1683                 : 
    1684           21605 :         offnum = Min(offnum, maxoff);
    1685 ECB             : 
    1686 GIC     4336927 :         while (offnum >= minoff)
    1687 ECB             :         {
    1688 GIC     4315364 :             ItemId      iid = PageGetItemId(page, offnum);
    1689 ECB             :             IndexTuple  itup;
    1690                 :             bool        tuple_alive;
    1691                 :             bool        passes_quals;
    1692                 : 
    1693                 :             /*
    1694                 :              * If the scan specifies not to return killed tuples, then we
    1695                 :              * treat a killed tuple as not passing the qual.  Most of the
    1696                 :              * time, it's a win to not bother examining the tuple's index
    1697                 :              * keys, but just skip to the next tuple (previous, actually,
    1698                 :              * since we're scanning backwards).  However, if this is the first
    1699                 :              * tuple on the page, we do check the index keys, to prevent
    1700                 :              * uselessly advancing to the page to the left.  This is similar
    1701                 :              * to the high key optimization used by forward scans.
    1702                 :              */
    1703 GIC     4315364 :             if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    1704                 :             {
    1705          490854 :                 Assert(offnum >= P_FIRSTDATAKEY(opaque));
    1706 CBC      490854 :                 if (offnum > P_FIRSTDATAKEY(opaque))
    1707                 :                 {
    1708          490675 :                     offnum = OffsetNumberPrev(offnum);
    1709          490675 :                     continue;
    1710                 :                 }
    1711 ECB             : 
    1712 CBC         179 :                 tuple_alive = false;
    1713                 :             }
    1714                 :             else
    1715         3824510 :                 tuple_alive = true;
    1716                 : 
    1717 GIC     3824689 :             itup = (IndexTuple) PageGetItem(page, iid);
    1718 ECB             : 
    1719 GIC     3824689 :             passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
    1720 ECB             :                                          &continuescan);
    1721 GIC     3824689 :             if (passes_quals && tuple_alive)
    1722 ECB             :             {
    1723                 :                 /* tuple passes all scan key conditions */
    1724 CBC     3824468 :                 if (!BTreeTupleIsPosting(itup))
    1725                 :                 {
    1726                 :                     /* Remember it */
    1727         3798388 :                     itemIndex--;
    1728 GIC     3798388 :                     _bt_saveitem(so, itemIndex, offnum, itup);
    1729                 :                 }
    1730 ECB             :                 else
    1731                 :                 {
    1732                 :                     int         tupleOffset;
    1733                 : 
    1734                 :                     /*
    1735                 :                      * Set up state to return posting list, and remember first
    1736                 :                      * TID.
    1737                 :                      *
    1738                 :                      * Note that we deliberately save/return items from
    1739                 :                      * posting lists in ascending heap TID order for backwards
    1740                 :                      * scans.  This allows _bt_killitems() to make a
    1741                 :                      * consistent assumption about the order of items
    1742                 :                      * associated with the same posting list tuple.
    1743                 :                      */
    1744 GIC       26080 :                     itemIndex--;
    1745                 :                     tupleOffset =
    1746           26080 :                         _bt_setuppostingitems(so, itemIndex, offnum,
    1747 ECB             :                                               BTreeTupleGetPostingN(itup, 0),
    1748                 :                                               itup);
    1749                 :                     /* Remember additional TIDs */
    1750 GIC       76589 :                     for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
    1751                 :                     {
    1752           50509 :                         itemIndex--;
    1753 CBC       50509 :                         _bt_savepostingitem(so, itemIndex, offnum,
    1754                 :                                             BTreeTupleGetPostingN(itup, i),
    1755 ECB             :                                             tupleOffset);
    1756                 :                     }
    1757                 :                 }
    1758                 :             }
    1759 GIC     3824689 :             if (!continuescan)
    1760                 :             {
    1761                 :                 /* there can't be any more matches, so stop */
    1762 CBC          42 :                 so->currPos.moreLeft = false;
    1763 GIC          42 :                 break;
    1764                 :             }
    1765 ECB             : 
    1766 CBC     3824647 :             offnum = OffsetNumberPrev(offnum);
    1767                 :         }
    1768                 : 
    1769           21605 :         Assert(itemIndex >= 0);
    1770 GIC       21605 :         so->currPos.firstItem = itemIndex;
    1771           21605 :         so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
    1772 CBC       21605 :         so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
    1773 ECB             :     }
    1774                 : 
    1775 CBC     8712708 :     return (so->currPos.firstItem <= so->currPos.lastItem);
    1776                 : }
    1777                 : 
    1778 ECB             : /* Save an index item into so->currPos.items[itemIndex] */
    1779                 : static void
    1780 GIC    24070875 : _bt_saveitem(BTScanOpaque so, int itemIndex,
    1781                 :              OffsetNumber offnum, IndexTuple itup)
    1782                 : {
    1783 CBC    24070875 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1784                 : 
    1785 GIC    24070875 :     Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
    1786 ECB             : 
    1787 GIC    24070875 :     currItem->heapTid = itup->t_tid;
    1788 CBC    24070875 :     currItem->indexOffset = offnum;
    1789 GIC    24070875 :     if (so->currTuples)
    1790 ECB             :     {
    1791 CBC     9999342 :         Size        itupsz = IndexTupleSize(itup);
    1792 ECB             : 
    1793 GIC     9999342 :         currItem->tupleOffset = so->currPos.nextTupleOffset;
    1794 CBC     9999342 :         memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
    1795 GIC     9999342 :         so->currPos.nextTupleOffset += MAXALIGN(itupsz);
    1796 ECB             :     }
    1797 CBC    24070875 : }
    1798 ECB             : 
    1799                 : /*
    1800                 :  * Setup state to save TIDs/items from a single posting list tuple.
    1801                 :  *
    1802                 :  * Saves an index item into so->currPos.items[itemIndex] for TID that is
    1803                 :  * returned to scan first.  Second or subsequent TIDs for posting list should
    1804                 :  * be saved by calling _bt_savepostingitem().
    1805                 :  *
    1806                 :  * Returns an offset into tuple storage space that main tuple is stored at if
    1807                 :  * needed.
    1808                 :  */
    1809                 : static int
    1810 GIC      310488 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
    1811                 :                       ItemPointer heapTid, IndexTuple itup)
    1812                 : {
    1813 CBC      310488 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1814                 : 
    1815 GIC      310488 :     Assert(BTreeTupleIsPosting(itup));
    1816 ECB             : 
    1817 GIC      310488 :     currItem->heapTid = *heapTid;
    1818 CBC      310488 :     currItem->indexOffset = offnum;
    1819 GIC      310488 :     if (so->currTuples)
    1820 ECB             :     {
    1821                 :         /* Save base IndexTuple (truncate posting list) */
    1822                 :         IndexTuple  base;
    1823 GIC      133414 :         Size        itupsz = BTreeTupleGetPostingOffset(itup);
    1824                 : 
    1825          133414 :         itupsz = MAXALIGN(itupsz);
    1826 CBC      133414 :         currItem->tupleOffset = so->currPos.nextTupleOffset;
    1827 GIC      133414 :         base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
    1828 CBC      133414 :         memcpy(base, itup, itupsz);
    1829 ECB             :         /* Defensively reduce work area index tuple header size */
    1830 CBC      133414 :         base->t_info &= ~INDEX_SIZE_MASK;
    1831          133414 :         base->t_info |= itupsz;
    1832 GIC      133414 :         so->currPos.nextTupleOffset += itupsz;
    1833 ECB             : 
    1834 CBC      133414 :         return currItem->tupleOffset;
    1835 ECB             :     }
    1836                 : 
    1837 CBC      177074 :     return 0;
    1838                 : }
    1839                 : 
    1840 ECB             : /*
    1841                 :  * Save an index item into so->currPos.items[itemIndex] for current posting
    1842                 :  * tuple.
    1843                 :  *
    1844                 :  * Assumes that _bt_setuppostingitems() has already been called for current
    1845                 :  * posting list tuple.  Caller passes its return value as tupleOffset.
    1846                 :  */
    1847                 : static inline void
    1848 GIC     1210254 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
    1849                 :                     ItemPointer heapTid, int tupleOffset)
    1850                 : {
    1851 CBC     1210254 :     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
    1852                 : 
    1853 GIC     1210254 :     currItem->heapTid = *heapTid;
    1854 CBC     1210254 :     currItem->indexOffset = offnum;
    1855                 : 
    1856 ECB             :     /*
    1857                 :      * Have index-only scans return the same base IndexTuple for every TID
    1858                 :      * that originates from the same posting list
    1859                 :      */
    1860 GIC     1210254 :     if (so->currTuples)
    1861          465558 :         currItem->tupleOffset = tupleOffset;
    1862         1210254 : }
    1863 ECB             : 
    1864                 : /*
    1865                 :  *  _bt_steppage() -- Step to next page containing valid data for scan
    1866                 :  *
    1867                 :  * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
    1868                 :  * if pinned, we'll drop the pin before moving to next page.  The buffer is
    1869                 :  * not locked on entry.
    1870                 :  *
    1871                 :  * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
    1872                 :  * read lock, on that page.  If we do not hold the pin, we set so->currPos.buf
    1873                 :  * to InvalidBuffer.  We return true to indicate success.
    1874                 :  */
    1875                 : static bool
    1876 GIC     4851051 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
    1877                 : {
    1878         4851051 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1879 CBC     4851051 :     BlockNumber blkno = InvalidBlockNumber;
    1880                 :     bool        status;
    1881 ECB             : 
    1882 CBC     4851051 :     Assert(BTScanPosIsValid(so->currPos));
    1883                 : 
    1884                 :     /* Before leaving current page, deal with any killed items */
    1885         4851051 :     if (so->numKilled > 0)
    1886 GIC       39535 :         _bt_killitems(scan);
    1887                 : 
    1888 ECB             :     /*
    1889                 :      * Before we modify currPos, make a copy of the page data if there was a
    1890                 :      * mark position that needs it.
    1891                 :      */
    1892 GIC     4851051 :     if (so->markItemIndex >= 0)
    1893                 :     {
    1894                 :         /* bump pin on current buffer for assignment to mark buffer */
    1895 CBC         183 :         if (BTScanPosIsPinned(so->currPos))
    1896 GIC         175 :             IncrBufferRefCount(so->currPos.buf);
    1897             183 :         memcpy(&so->markPos, &so->currPos,
    1898 ECB             :                offsetof(BTScanPosData, items[1]) +
    1899 CBC         183 :                so->currPos.lastItem * sizeof(BTScanPosItem));
    1900             183 :         if (so->markTuples)
    1901 GIC         174 :             memcpy(so->markTuples, so->currTuples,
    1902 CBC         174 :                    so->currPos.nextTupleOffset);
    1903             183 :         so->markPos.itemIndex = so->markItemIndex;
    1904             183 :         so->markItemIndex = -1;
    1905 ECB             :     }
    1906                 : 
    1907 CBC     4851051 :     if (ScanDirectionIsForward(dir))
    1908                 :     {
    1909                 :         /* Walk right to the next page with data */
    1910         4851005 :         if (scan->parallel_scan != NULL)
    1911                 :         {
    1912                 :             /*
    1913 ECB             :              * Seize the scan to get the next block number; if the scan has
    1914                 :              * ended already, bail out.
    1915                 :              */
    1916 GIC         644 :             status = _bt_parallel_seize(scan, &blkno);
    1917             644 :             if (!status)
    1918                 :             {
    1919 ECB             :                 /* release the previous buffer, if pinned */
    1920 CBC           6 :                 BTScanPosUnpinIfPinned(so->currPos);
    1921 GIC           6 :                 BTScanPosInvalidate(so->currPos);
    1922               6 :                 return false;
    1923 ECB             :             }
    1924                 :         }
    1925                 :         else
    1926                 :         {
    1927                 :             /* Not parallel, so use the previously-saved nextPage link. */
    1928 GIC     4850361 :             blkno = so->currPos.nextPage;
    1929                 :         }
    1930                 : 
    1931 ECB             :         /* Remember we left a page with data */
    1932 GIC     4850999 :         so->currPos.moreLeft = true;
    1933                 : 
    1934                 :         /* release the previous buffer, if pinned */
    1935 CBC     4850999 :         BTScanPosUnpinIfPinned(so->currPos);
    1936                 :     }
    1937                 :     else
    1938 ECB             :     {
    1939                 :         /* Remember we left a page with data */
    1940 GIC          46 :         so->currPos.moreRight = true;
    1941                 : 
    1942              46 :         if (scan->parallel_scan != NULL)
    1943 ECB             :         {
    1944                 :             /*
    1945                 :              * Seize the scan to get the current block number; if the scan has
    1946                 :              * ended already, bail out.
    1947                 :              */
    1948 UIC           0 :             status = _bt_parallel_seize(scan, &blkno);
    1949               0 :             BTScanPosUnpinIfPinned(so->currPos);
    1950               0 :             if (!status)
    1951 EUB             :             {
    1952 UBC           0 :                 BTScanPosInvalidate(so->currPos);
    1953               0 :                 return false;
    1954                 :             }
    1955 EUB             :         }
    1956                 :         else
    1957                 :         {
    1958                 :             /* Not parallel, so just use our own notion of the current page */
    1959 GIC          46 :             blkno = so->currPos.currPage;
    1960                 :         }
    1961                 :     }
    1962 ECB             : 
    1963 GIC     4851045 :     if (!_bt_readnextpage(scan, blkno, dir))
    1964         4836686 :         return false;
    1965                 : 
    1966 ECB             :     /* Drop the lock, and maybe the pin, on the current page */
    1967 CBC       14359 :     _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    1968                 : 
    1969 GIC       14359 :     return true;
    1970 ECB             : }
    1971                 : 
    1972                 : /*
    1973                 :  *  _bt_readnextpage() -- Read next page containing valid data for scan
    1974                 :  *
    1975                 :  * On success exit, so->currPos is updated to contain data from the next
    1976                 :  * interesting page.  Caller is responsible to release lock and pin on
    1977                 :  * buffer on success.  We return true to indicate success.
    1978                 :  *
    1979                 :  * If there are no more matching records in the given direction, we drop all
    1980                 :  * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
    1981                 :  */
    1982                 : static bool
    1983 GIC     4851049 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
    1984                 : {
    1985         4851049 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    1986 ECB             :     Relation    rel;
    1987                 :     Page        page;
    1988                 :     BTPageOpaque opaque;
    1989                 :     bool        status;
    1990                 : 
    1991 GIC     4851049 :     rel = scan->indexRelation;
    1992                 : 
    1993         4851049 :     if (ScanDirectionIsForward(dir))
    1994 ECB             :     {
    1995                 :         for (;;)
    1996                 :         {
    1997                 :             /*
    1998                 :              * if we're at end of scan, give up and mark parallel scan as
    1999                 :              * done, so that all the workers can finish their scan
    2000                 :              */
    2001 GIC     4851755 :             if (blkno == P_NONE || !so->currPos.moreRight)
    2002                 :             {
    2003         4836652 :                 _bt_parallel_done(scan);
    2004 CBC     4836652 :                 BTScanPosInvalidate(so->currPos);
    2005 GIC     4836652 :                 return false;
    2006 ECB             :             }
    2007                 :             /* check for interrupts while we're not holding any buffer lock */
    2008 CBC       15103 :             CHECK_FOR_INTERRUPTS();
    2009                 :             /* step right one page */
    2010 GNC       15103 :             so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno, BT_READ);
    2011 CBC       15103 :             page = BufferGetPage(so->currPos.buf);
    2012 GIC       15103 :             TestForOldSnapshot(scan->xs_snapshot, rel, page);
    2013 CBC       15103 :             opaque = BTPageGetOpaque(page);
    2014 ECB             :             /* check for deleted page */
    2015 CBC       15103 :             if (!P_IGNORE(opaque))
    2016 ECB             :             {
    2017 GIC       15103 :                 PredicateLockPage(rel, blkno, scan->xs_snapshot);
    2018 ECB             :                 /* see if there are any matches on this page */
    2019                 :                 /* note that this will clear moreRight if we can stop */
    2020 CBC       15103 :                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
    2021 GIC       14351 :                     break;
    2022                 :             }
    2023 LBC           0 :             else if (scan->parallel_scan != NULL)
    2024 ECB             :             {
    2025                 :                 /* allow next page be processed by parallel worker */
    2026 UBC           0 :                 _bt_parallel_release(scan, opaque->btpo_next);
    2027                 :             }
    2028                 : 
    2029 EUB             :             /* nope, keep going */
    2030 GIC         752 :             if (scan->parallel_scan != NULL)
    2031                 :             {
    2032 UIC           0 :                 _bt_relbuf(rel, so->currPos.buf);
    2033 LBC           0 :                 status = _bt_parallel_seize(scan, &blkno);
    2034 UIC           0 :                 if (!status)
    2035 EUB             :                 {
    2036 UBC           0 :                     BTScanPosInvalidate(so->currPos);
    2037               0 :                     return false;
    2038                 :                 }
    2039 EUB             :             }
    2040                 :             else
    2041                 :             {
    2042 GIC         752 :                 blkno = opaque->btpo_next;
    2043             752 :                 _bt_relbuf(rel, so->currPos.buf);
    2044                 :             }
    2045 ECB             :         }
    2046                 :     }
    2047                 :     else
    2048                 :     {
    2049                 :         /*
    2050                 :          * Should only happen in parallel cases, when some other backend
    2051                 :          * advanced the scan.
    2052                 :          */
    2053 GIC          46 :         if (so->currPos.currPage != blkno)
    2054                 :         {
    2055 UIC           0 :             BTScanPosUnpinIfPinned(so->currPos);
    2056 LBC           0 :             so->currPos.currPage = blkno;
    2057                 :         }
    2058 EUB             : 
    2059                 :         /*
    2060                 :          * Walk left to the next page with data.  This is much more complex
    2061                 :          * than the walk-right case because of the possibility that the page
    2062                 :          * to our left splits while we are in flight to it, plus the
    2063                 :          * possibility that the page we were on gets deleted after we leave
    2064                 :          * it.  See nbtree/README for details.
    2065                 :          *
    2066                 :          * It might be possible to rearrange this code to have less overhead
    2067                 :          * in pinning and locking, but that would require capturing the left
    2068                 :          * pointer when the page is initially read, and using it here, along
    2069                 :          * with big changes to _bt_walk_left() and the code below.  It is not
    2070                 :          * clear whether this would be a win, since if the page immediately to
    2071                 :          * the left splits after we read this page and before we step left, we
    2072                 :          * would need to visit more pages than with the current code.
    2073                 :          *
    2074                 :          * Note that if we change the code so that we drop the pin for a scan
    2075                 :          * which uses a non-MVCC snapshot, we will need to modify the code for
    2076                 :          * walking left, to allow for the possibility that a referenced page
    2077                 :          * has been deleted.  As long as the buffer is pinned or the snapshot
    2078                 :          * is MVCC the page cannot move past the half-dead state to fully
    2079                 :          * deleted.
    2080                 :          */
    2081 GIC          46 :         if (BTScanPosIsPinned(so->currPos))
    2082              24 :             _bt_lockbuf(rel, so->currPos.buf, BT_READ);
    2083                 :         else
    2084 GNC          22 :             so->currPos.buf = _bt_getbuf(rel, scan->heapRelation,
    2085                 :                                          so->currPos.currPage, BT_READ);
    2086 ECB             : 
    2087                 :         for (;;)
    2088                 :         {
    2089                 :             /* Done if we know there are no matching keys to the left */
    2090 GIC          48 :             if (!so->currPos.moreLeft)
    2091                 :             {
    2092              21 :                 _bt_relbuf(rel, so->currPos.buf);
    2093              21 :                 _bt_parallel_done(scan);
    2094 CBC          21 :                 BTScanPosInvalidate(so->currPos);
    2095 GIC          21 :                 return false;
    2096 ECB             :             }
    2097                 : 
    2098                 :             /* Step to next physical page */
    2099 GNC          27 :             so->currPos.buf = _bt_walk_left(rel, scan->heapRelation,
    2100                 :                                             so->currPos.buf, scan->xs_snapshot);
    2101                 : 
    2102                 :             /* if we're physically at end of index, return failure */
    2103 CBC          27 :             if (so->currPos.buf == InvalidBuffer)
    2104                 :             {
    2105 GIC          13 :                 _bt_parallel_done(scan);
    2106              13 :                 BTScanPosInvalidate(so->currPos);
    2107 CBC          13 :                 return false;
    2108                 :             }
    2109 ECB             : 
    2110                 :             /*
    2111                 :              * Okay, we managed to move left to a non-deleted page. Done if
    2112                 :              * it's not half-dead and contains matching tuples. Else loop back
    2113                 :              * and do it all again.
    2114                 :              */
    2115 GIC          14 :             page = BufferGetPage(so->currPos.buf);
    2116              14 :             TestForOldSnapshot(scan->xs_snapshot, rel, page);
    2117              14 :             opaque = BTPageGetOpaque(page);
    2118              14 :             if (!P_IGNORE(opaque))
    2119 ECB             :             {
    2120 CBC          14 :                 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
    2121 ECB             :                 /* see if there are any matches on this page */
    2122                 :                 /* note that this will clear moreLeft if we can stop */
    2123 GIC          14 :                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
    2124 CBC          12 :                     break;
    2125                 :             }
    2126 UIC           0 :             else if (scan->parallel_scan != NULL)
    2127 ECB             :             {
    2128                 :                 /* allow next page be processed by parallel worker */
    2129 UIC           0 :                 _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
    2130 EUB             :             }
    2131                 : 
    2132                 :             /*
    2133                 :              * For parallel scans, get the last page scanned as it is quite
    2134                 :              * possible that by the time we try to seize the scan, some other
    2135                 :              * worker has already advanced the scan to a different page.  We
    2136                 :              * must continue based on the latest page scanned by any worker.
    2137                 :              */
    2138 GIC           2 :             if (scan->parallel_scan != NULL)
    2139                 :             {
    2140 UIC           0 :                 _bt_relbuf(rel, so->currPos.buf);
    2141               0 :                 status = _bt_parallel_seize(scan, &blkno);
    2142 LBC           0 :                 if (!status)
    2143                 :                 {
    2144 UBC           0 :                     BTScanPosInvalidate(so->currPos);
    2145               0 :                     return false;
    2146 EUB             :                 }
    2147 UNC           0 :                 so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno,
    2148                 :                                              BT_READ);
    2149 EUB             :             }
    2150                 :         }
    2151                 :     }
    2152                 : 
    2153 GIC       14363 :     return true;
    2154                 : }
    2155                 : 
    2156                 : /*
    2157                 :  *  _bt_parallel_readpage() -- Read current page containing valid data for scan
    2158 ECB             :  *
    2159                 :  * On success, release lock and maybe pin on buffer.  We return true to
    2160                 :  * indicate success.
    2161                 :  */
    2162                 : static bool
    2163 GIC           4 : _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
    2164                 : {
    2165               4 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2166                 : 
    2167               4 :     _bt_initialize_more_data(so, dir);
    2168 ECB             : 
    2169 GIC           4 :     if (!_bt_readnextpage(scan, blkno, dir))
    2170 LBC           0 :         return false;
    2171                 : 
    2172 ECB             :     /* Drop the lock, and maybe the pin, on the current page */
    2173 GIC           4 :     _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    2174 ECB             : 
    2175 GBC           4 :     return true;
    2176                 : }
    2177                 : 
    2178 ECB             : /*
    2179                 :  * _bt_walk_left() -- step left one page, if possible
    2180                 :  *
    2181                 :  * The given buffer must be pinned and read-locked.  This will be dropped
    2182                 :  * before stepping left.  On return, we have pin and read lock on the
    2183                 :  * returned page, instead.
    2184                 :  *
    2185                 :  * Returns InvalidBuffer if there is no page to the left (no lock is held
    2186                 :  * in that case).
    2187                 :  *
    2188                 :  * When working on a non-leaf level, it is possible for the returned page
    2189                 :  * to be half-dead; the caller should check that condition and step left
    2190                 :  * again if it's important.
    2191                 :  */
    2192                 : static Buffer
    2193 GNC          27 : _bt_walk_left(Relation rel, Relation heaprel, Buffer buf, Snapshot snapshot)
    2194                 : {
    2195                 :     Page        page;
    2196                 :     BTPageOpaque opaque;
    2197                 : 
    2198 CBC          27 :     page = BufferGetPage(buf);
    2199 GIC          27 :     opaque = BTPageGetOpaque(page);
    2200                 : 
    2201                 :     for (;;)
    2202 UIC           0 :     {
    2203 ECB             :         BlockNumber obknum;
    2204                 :         BlockNumber lblkno;
    2205                 :         BlockNumber blkno;
    2206                 :         int         tries;
    2207 EUB             : 
    2208                 :         /* if we're at end of tree, release buf and return failure */
    2209 GIC          27 :         if (P_LEFTMOST(opaque))
    2210                 :         {
    2211              13 :             _bt_relbuf(rel, buf);
    2212              13 :             break;
    2213                 :         }
    2214 ECB             :         /* remember original page we are stepping left from */
    2215 GIC          14 :         obknum = BufferGetBlockNumber(buf);
    2216 ECB             :         /* step left */
    2217 CBC          14 :         blkno = lblkno = opaque->btpo_prev;
    2218 GIC          14 :         _bt_relbuf(rel, buf);
    2219                 :         /* check for interrupts while we're not holding any buffer lock */
    2220 CBC          14 :         CHECK_FOR_INTERRUPTS();
    2221 GNC          14 :         buf = _bt_getbuf(rel, heaprel, blkno, BT_READ);
    2222 CBC          14 :         page = BufferGetPage(buf);
    2223              14 :         TestForOldSnapshot(snapshot, rel, page);
    2224 GIC          14 :         opaque = BTPageGetOpaque(page);
    2225 ECB             : 
    2226                 :         /*
    2227                 :          * If this isn't the page we want, walk right till we find what we
    2228                 :          * want --- but go no more than four hops (an arbitrary limit). If we
    2229                 :          * don't find the correct page by then, the most likely bet is that
    2230                 :          * the original page got deleted and isn't in the sibling chain at all
    2231                 :          * anymore, not that its left sibling got split more than four times.
    2232                 :          *
    2233                 :          * Note that it is correct to test P_ISDELETED not P_IGNORE here,
    2234                 :          * because half-dead pages are still in the sibling chain.  Caller
    2235                 :          * must reject half-dead pages if wanted.
    2236                 :          */
    2237 GIC          14 :         tries = 0;
    2238                 :         for (;;)
    2239                 :         {
    2240              14 :             if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
    2241                 :             {
    2242 ECB             :                 /* Found desired page, return it */
    2243 GIC          14 :                 return buf;
    2244                 :             }
    2245 LBC           0 :             if (P_RIGHTMOST(opaque) || ++tries > 4)
    2246                 :                 break;
    2247 UIC           0 :             blkno = opaque->btpo_next;
    2248 LBC           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2249 UIC           0 :             page = BufferGetPage(buf);
    2250 UBC           0 :             TestForOldSnapshot(snapshot, rel, page);
    2251 UIC           0 :             opaque = BTPageGetOpaque(page);
    2252 EUB             :         }
    2253                 : 
    2254                 :         /* Return to the original page to see what's up */
    2255 UBC           0 :         buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
    2256               0 :         page = BufferGetPage(buf);
    2257 UIC           0 :         TestForOldSnapshot(snapshot, rel, page);
    2258               0 :         opaque = BTPageGetOpaque(page);
    2259               0 :         if (P_ISDELETED(opaque))
    2260 EUB             :         {
    2261                 :             /*
    2262                 :              * It was deleted.  Move right to first nondeleted page (there
    2263                 :              * must be one); that is the page that has acquired the deleted
    2264                 :              * one's keyspace, so stepping left from it will take us where we
    2265                 :              * want to be.
    2266                 :              */
    2267                 :             for (;;)
    2268                 :             {
    2269 UIC           0 :                 if (P_RIGHTMOST(opaque))
    2270               0 :                     elog(ERROR, "fell off the end of index \"%s\"",
    2271                 :                          RelationGetRelationName(rel));
    2272               0 :                 blkno = opaque->btpo_next;
    2273               0 :                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2274 UBC           0 :                 page = BufferGetPage(buf);
    2275               0 :                 TestForOldSnapshot(snapshot, rel, page);
    2276 UIC           0 :                 opaque = BTPageGetOpaque(page);
    2277 UBC           0 :                 if (!P_ISDELETED(opaque))
    2278               0 :                     break;
    2279 EUB             :             }
    2280                 : 
    2281                 :             /*
    2282                 :              * Now return to top of loop, resetting obknum to point to this
    2283                 :              * nondeleted page, and try again.
    2284                 :              */
    2285                 :         }
    2286                 :         else
    2287                 :         {
    2288                 :             /*
    2289                 :              * It wasn't deleted; the explanation had better be that the page
    2290                 :              * to the left got split or deleted. Without this check, we'd go
    2291                 :              * into an infinite loop if there's anything wrong.
    2292                 :              */
    2293 UIC           0 :             if (opaque->btpo_prev == lblkno)
    2294               0 :                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
    2295                 :                      obknum, RelationGetRelationName(rel));
    2296                 :             /* Okay to try again with new lblkno value */
    2297                 :         }
    2298 EUB             :     }
    2299                 : 
    2300 GIC          13 :     return InvalidBuffer;
    2301                 : }
    2302                 : 
    2303                 : /*
    2304                 :  * _bt_get_endpoint() -- Find the first or last page on a given tree level
    2305 ECB             :  *
    2306                 :  * If the index is empty, we will return InvalidBuffer; any other failure
    2307                 :  * condition causes ereport().  We will not return a dead page.
    2308                 :  *
    2309                 :  * The returned buffer is pinned and read-locked.
    2310                 :  */
    2311                 : Buffer
    2312 GNC       31969 : _bt_get_endpoint(Relation rel, Relation heaprel, uint32 level, bool rightmost,
    2313                 :                  Snapshot snapshot)
    2314                 : {
    2315                 :     Buffer      buf;
    2316                 :     Page        page;
    2317 ECB             :     BTPageOpaque opaque;
    2318                 :     OffsetNumber offnum;
    2319                 :     BlockNumber blkno;
    2320                 :     IndexTuple  itup;
    2321                 : 
    2322                 :     /*
    2323                 :      * If we are looking for a leaf page, okay to descend from fast root;
    2324                 :      * otherwise better descend from true root.  (There is no point in being
    2325                 :      * smarter about intermediate levels.)
    2326                 :      */
    2327 GIC       31969 :     if (level == 0)
    2328 GNC       31957 :         buf = _bt_getroot(rel, heaprel, BT_READ);
    2329                 :     else
    2330              12 :         buf = _bt_gettrueroot(rel, heaprel);
    2331                 : 
    2332 CBC       31969 :     if (!BufferIsValid(buf))
    2333            3197 :         return InvalidBuffer;
    2334                 : 
    2335           28772 :     page = BufferGetPage(buf);
    2336 GIC       28772 :     TestForOldSnapshot(snapshot, rel, page);
    2337 CBC       28772 :     opaque = BTPageGetOpaque(page);
    2338 ECB             : 
    2339                 :     for (;;)
    2340                 :     {
    2341                 :         /*
    2342                 :          * If we landed on a deleted page, step right to find a live page
    2343                 :          * (there must be one).  Also, if we want the rightmost page, step
    2344                 :          * right if needed to get to it (this could happen if the page split
    2345                 :          * since we obtained a pointer to it).
    2346                 :          */
    2347 GIC       42145 :         while (P_IGNORE(opaque) ||
    2348              27 :                (rightmost && !P_RIGHTMOST(opaque)))
    2349                 :         {
    2350 UIC           0 :             blkno = opaque->btpo_next;
    2351               0 :             if (blkno == P_NONE)
    2352 LBC           0 :                 elog(ERROR, "fell off the end of index \"%s\"",
    2353 ECB             :                      RelationGetRelationName(rel));
    2354 UIC           0 :             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2355 UBC           0 :             page = BufferGetPage(buf);
    2356               0 :             TestForOldSnapshot(snapshot, rel, page);
    2357               0 :             opaque = BTPageGetOpaque(page);
    2358                 :         }
    2359 EUB             : 
    2360                 :         /* Done? */
    2361 GBC       42145 :         if (opaque->btpo_level == level)
    2362           28772 :             break;
    2363 GIC       13373 :         if (opaque->btpo_level < level)
    2364 UIC           0 :             ereport(ERROR,
    2365                 :                     (errcode(ERRCODE_INDEX_CORRUPTED),
    2366 ECB             :                      errmsg_internal("btree level %u not found in index \"%s\"",
    2367                 :                                      level, RelationGetRelationName(rel))));
    2368                 : 
    2369 EUB             :         /* Descend to leftmost or rightmost child page */
    2370 GIC       13373 :         if (rightmost)
    2371               3 :             offnum = PageGetMaxOffsetNumber(page);
    2372                 :         else
    2373           13370 :             offnum = P_FIRSTDATAKEY(opaque);
    2374                 : 
    2375 CBC       13373 :         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
    2376           13373 :         blkno = BTreeTupleGetDownLink(itup);
    2377                 : 
    2378           13373 :         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
    2379 GIC       13373 :         page = BufferGetPage(buf);
    2380 CBC       13373 :         opaque = BTPageGetOpaque(page);
    2381 ECB             :     }
    2382                 : 
    2383 CBC       28772 :     return buf;
    2384 ECB             : }
    2385                 : 
    2386                 : /*
    2387                 :  *  _bt_endpoint() -- Find the first or last page in the index, and scan
    2388                 :  * from there to the first key satisfying all the quals.
    2389                 :  *
    2390                 :  * This is used by _bt_first() to set up a scan when we've determined
    2391                 :  * that the scan must start at the beginning or end of the index (for
    2392                 :  * a forward or backward scan respectively).  Exit conditions are the
    2393                 :  * same as for _bt_first().
    2394                 :  */
    2395                 : static bool
    2396 GIC       31957 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
    2397                 : {
    2398           31957 :     Relation    rel = scan->indexRelation;
    2399           31957 :     BTScanOpaque so = (BTScanOpaque) scan->opaque;
    2400                 :     Buffer      buf;
    2401 ECB             :     Page        page;
    2402                 :     BTPageOpaque opaque;
    2403                 :     OffsetNumber start;
    2404                 :     BTScanPosItem *currItem;
    2405                 : 
    2406                 :     /*
    2407                 :      * Scan down to the leftmost or rightmost leaf page.  This is a simplified
    2408                 :      * version of _bt_search().  We don't maintain a stack since we know we
    2409                 :      * won't need it.
    2410                 :      */
    2411 GNC       31957 :     buf = _bt_get_endpoint(rel, scan->heapRelation, 0,
    2412                 :                            ScanDirectionIsBackward(dir), scan->xs_snapshot);
    2413                 : 
    2414 GIC       31957 :     if (!BufferIsValid(buf))
    2415                 :     {
    2416                 :         /*
    2417 ECB             :          * Empty index. Lock the whole relation, as nothing finer to lock
    2418                 :          * exists.
    2419                 :          */
    2420 CBC        3197 :         PredicateLockRelation(rel, scan->xs_snapshot);
    2421 GIC        3197 :         BTScanPosInvalidate(so->currPos);
    2422            3197 :         return false;
    2423                 :     }
    2424                 : 
    2425           28760 :     PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
    2426 CBC       28760 :     page = BufferGetPage(buf);
    2427           28760 :     opaque = BTPageGetOpaque(page);
    2428           28760 :     Assert(P_ISLEAF(opaque));
    2429                 : 
    2430 GIC       28760 :     if (ScanDirectionIsForward(dir))
    2431 ECB             :     {
    2432                 :         /* There could be dead pages to the left, so not this: */
    2433                 :         /* Assert(P_LEFTMOST(opaque)); */
    2434                 : 
    2435 GIC       28736 :         start = P_FIRSTDATAKEY(opaque);
    2436 ECB             :     }
    2437 GIC          24 :     else if (ScanDirectionIsBackward(dir))
    2438                 :     {
    2439              24 :         Assert(P_RIGHTMOST(opaque));
    2440                 : 
    2441 CBC          24 :         start = PageGetMaxOffsetNumber(page);
    2442                 :     }
    2443 ECB             :     else
    2444                 :     {
    2445 LBC           0 :         elog(ERROR, "invalid scan direction: %d", (int) dir);
    2446                 :         start = 0;              /* keep compiler quiet */
    2447 ECB             :     }
    2448                 : 
    2449                 :     /* remember which buffer we have pinned */
    2450 GIC       28760 :     so->currPos.buf = buf;
    2451 EUB             : 
    2452 GIC       28760 :     _bt_initialize_more_data(so, dir);
    2453                 : 
    2454                 :     /*
    2455                 :      * Now load data from the first page of the scan.
    2456 ECB             :      */
    2457 GIC       28760 :     if (!_bt_readpage(scan, dir, start))
    2458 ECB             :     {
    2459                 :         /*
    2460                 :          * There's no actually-matching data on this page.  Try to advance to
    2461                 :          * the next page.  Return false if there's no matching data at all.
    2462                 :          */
    2463 CBC         820 :         _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
    2464 GIC         820 :         if (!_bt_steppage(scan, dir))
    2465             781 :             return false;
    2466                 :     }
    2467                 :     else
    2468                 :     {
    2469 ECB             :         /* Drop the lock, and maybe the pin, on the current page */
    2470 CBC       27940 :         _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
    2471 ECB             :     }
    2472                 : 
    2473                 :     /* OK, itemIndex says what to return */
    2474 GIC       27979 :     currItem = &so->currPos.items[so->currPos.itemIndex];
    2475           27979 :     scan->xs_heaptid = currItem->heapTid;
    2476 CBC       27979 :     if (scan->xs_want_itup)
    2477 GIC       25940 :         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
    2478                 : 
    2479           27979 :     return true;
    2480 ECB             : }
    2481                 : 
    2482                 : /*
    2483                 :  * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
    2484                 :  * for scan direction
    2485                 :  */
    2486                 : static inline void
    2487 GIC     8697595 : _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
    2488                 : {
    2489                 :     /* initialize moreLeft/moreRight appropriately for scan direction */
    2490         8697595 :     if (ScanDirectionIsForward(dir))
    2491                 :     {
    2492         8676004 :         so->currPos.moreLeft = false;
    2493 CBC     8676004 :         so->currPos.moreRight = true;
    2494                 :     }
    2495                 :     else
    2496 ECB             :     {
    2497 GIC       21591 :         so->currPos.moreLeft = true;
    2498 CBC       21591 :         so->currPos.moreRight = false;
    2499 ECB             :     }
    2500 GIC     8697595 :     so->numKilled = 0;           /* just paranoia */
    2501         8697595 :     so->markItemIndex = -1;      /* ditto */
    2502         8697595 : }
        

Generated by: LCOV version v1.16-55-g56c0a2a