Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsearch.c
4 : * Search code for postgres btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtsearch.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/relscan.h"
20 : #include "miscadmin.h"
21 : #include "pgstat.h"
22 : #include "storage/predicate.h"
23 : #include "utils/lsyscache.h"
24 : #include "utils/rel.h"
25 :
26 :
27 : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
28 : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
29 : static int _bt_binsrch_posting(BTScanInsert key, Page page,
30 : OffsetNumber offnum);
31 : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
32 : OffsetNumber offnum);
33 : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
34 : OffsetNumber offnum, IndexTuple itup);
35 : static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
36 : OffsetNumber offnum, ItemPointer heapTid,
37 : IndexTuple itup);
38 : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
39 : OffsetNumber offnum,
40 : ItemPointer heapTid, int tupleOffset);
41 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
42 : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
43 : static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
44 : ScanDirection dir);
45 : static Buffer _bt_walk_left(Relation rel, Relation heaprel, Buffer buf,
46 : Snapshot snapshot);
47 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
48 : static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
49 :
50 :
51 : /*
52 : * _bt_drop_lock_and_maybe_pin()
53 : *
54 : * Unlock the buffer; and if it is safe to release the pin, do that, too.
55 : * This will prevent vacuum from stalling in a blocked state trying to read a
56 : * page when a cursor is sitting on it.
57 : *
58 : * See nbtree/README section on making concurrent TID recycling safe.
59 : */
60 : static void
2937 kgrittn 61 GIC 5022230 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
2937 kgrittn 62 ECB : {
992 pg 63 GIC 5022230 : _bt_unlockbuf(scan->indexRelation, sp->buf);
2937 kgrittn 64 ECB :
2937 kgrittn 65 GIC 5022230 : if (IsMVCCSnapshot(scan->xs_snapshot) &&
2937 kgrittn 66 CBC 4849383 : RelationNeedsWAL(scan->indexRelation) &&
67 4846854 : !scan->xs_want_itup)
2937 kgrittn 68 ECB : {
2937 kgrittn 69 GIC 4797883 : ReleaseBuffer(sp->buf);
2937 kgrittn 70 CBC 4797883 : sp->buf = InvalidBuffer;
2937 kgrittn 71 ECB : }
2937 kgrittn 72 GIC 5022230 : }
9770 scrappy 73 ECB :
74 : /*
75 : * _bt_search() -- Search the tree for a particular scankey,
76 : * or more precisely for the first leaf page it could be on.
77 : *
78 : * The passed scankey is an insertion-type scankey (see nbtree/README),
79 : * but it can omit the rightmost column(s) of the index.
80 : *
81 : * Return value is a stack of parent-page pointers (i.e. there is no entry for
82 : * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
83 : * which is locked and pinned. No locks are held on the parent pages,
84 : * however!
85 : *
86 : * If the snapshot parameter is not NULL, "old snapshot" checking will take
87 : * place during the descent through the tree. This is not needed when
88 : * positioning for an insert or delete, so NULL is used for those cases.
89 : *
90 : * The returned buffer is locked according to access parameter. Additionally,
91 : * access = BT_WRITE will allow an empty root page to be created and returned.
92 : * When access = BT_READ, an empty index will result in *bufP being set to
93 : * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
94 : * during the search will be finished.
95 : */
96 : BTStack
8 andres 97 GNC 16390240 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
98 : int access, Snapshot snapshot)
99 : {
8053 bruce 100 GIC 16390240 : BTStack stack_in = NULL;
1716 akorotkov 101 CBC 16390240 : int page_access = BT_READ;
9345 bruce 102 ECB :
103 : /* Get the root page to start with */
8 andres 104 GNC 16390240 : *bufP = _bt_getroot(rel, heaprel, access);
9345 bruce 105 ECB :
106 : /* If index is empty and access = BT_READ, no root page is created. */
8053 bruce 107 GIC 16390240 : if (!BufferIsValid(*bufP))
8297 tgl 108 CBC 385754 : return (BTStack) NULL;
9345 bruce 109 ECB :
110 : /* Loop iterates once per level descended in the tree */
111 : for (;;)
8297 tgl 112 GIC 12274663 : {
8297 tgl 113 ECB : Page page;
114 : BTPageOpaque opaque;
115 : OffsetNumber offnum;
116 : ItemId itemid;
117 : IndexTuple itup;
118 : BlockNumber child;
119 : BTStack new_stack;
120 :
121 : /*
122 : * Race -- the page we just grabbed may have split since we read its
123 : * downlink in its parent page (or the metapage). If it has, we may
124 : * need to move right to its new sibling. Do that.
125 : *
126 : * In write-mode, allow _bt_moveright to finish any incomplete splits
127 : * along the way. Strictly speaking, we'd only need to finish an
128 : * incomplete split on the leaf page we're about to insert to, not on
129 : * any of the upper levels (internal pages with incomplete splits are
130 : * also taken care of in _bt_getstackbuf). But this is a good
131 : * opportunity to finish splits of internal pages too.
132 : */
8 andres 133 GNC 28279149 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
134 : stack_in, page_access, snapshot);
135 :
136 : /* if this is a leaf page, we're done */
2545 kgrittn 137 GIC 28279149 : page = BufferGetPage(*bufP);
373 michael 138 CBC 28279149 : opaque = BTPageGetOpaque(page);
8297 tgl 139 28279149 : if (P_ISLEAF(opaque))
140 16004486 : break;
9345 bruce 141 ECB :
142 : /*
143 : * Find the appropriate pivot tuple on this page. Its downlink points
144 : * to the child page that we're about to descend to.
145 : */
1481 pg 146 GIC 12274663 : offnum = _bt_binsrch(rel, key, *bufP);
8297 tgl 147 CBC 12274663 : itemid = PageGetItemId(page, offnum);
6283 148 12274663 : itup = (IndexTuple) PageGetItem(page, itemid);
1138 pg 149 12274663 : Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
1011 150 12274663 : child = BTreeTupleGetDownLink(itup);
9345 bruce 151 ECB :
152 : /*
153 : * We need to save the location of the pivot tuple we chose in a new
154 : * stack entry for this page/level. If caller ends up splitting a
155 : * page one level down, it usually ends up inserting a new pivot
156 : * tuple/downlink immediately after the location recorded here.
157 : */
8297 tgl 158 GIC 12274663 : new_stack = (BTStack) palloc(sizeof(BTStackData));
1011 pg 159 CBC 12274663 : new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
8297 tgl 160 12274663 : new_stack->bts_offset = offnum;
161 12274663 : new_stack->bts_parent = stack_in;
9345 bruce 162 ECB :
163 : /*
164 : * Page level 1 is lowest non-leaf page level prior to leaves. So, if
165 : * we're on the level 1 and asked to lock leaf page in write mode,
166 : * then lock next page in write mode, because it must be a leaf.
167 : */
774 pg 168 GIC 12274663 : if (opaque->btpo_level == 1 && access == BT_WRITE)
1716 akorotkov 169 CBC 5665181 : page_access = BT_WRITE;
1716 akorotkov 170 ECB :
171 : /* drop the read lock on the page, then acquire one on its child */
1011 pg 172 GIC 12274663 : *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
9345 bruce 173 ECB :
174 : /* okay, all set to move down a level */
8297 tgl 175 GIC 12274663 : stack_in = new_stack;
8297 tgl 176 ECB : }
177 :
178 : /*
179 : * If we're asked to lock leaf in write mode, but didn't manage to, then
180 : * relock. This should only happen when the root page is a leaf page (and
181 : * the only page in the index other than the metapage).
182 : */
1716 akorotkov 183 GIC 16004486 : if (access == BT_WRITE && page_access == BT_READ)
1716 akorotkov 184 ECB : {
185 : /* trade in our read lock for a write lock */
992 pg 186 GIC 1468243 : _bt_unlockbuf(rel, *bufP);
992 pg 187 CBC 1468243 : _bt_lockbuf(rel, *bufP, BT_WRITE);
1716 akorotkov 188 ECB :
189 : /*
190 : * Race -- the leaf page may have split after we dropped the read lock
191 : * but before we acquired a write lock. If it has, we may need to
192 : * move right to its new sibling. Do that.
193 : */
8 andres 194 GNC 1468243 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE,
1481 pg 195 ECB : snapshot);
196 : }
197 :
8297 tgl 198 GIC 16004486 : return stack_in;
9770 scrappy 199 ECB : }
200 :
201 : /*
202 : * _bt_moveright() -- move right in the btree if necessary.
203 : *
204 : * When we follow a pointer to reach a page, it is possible that
205 : * the page has changed in the meanwhile. If this happens, we're
206 : * guaranteed that the page has "split right" -- that is, that any
207 : * data that appeared on the page originally is either on the page
208 : * or strictly to the right of it.
209 : *
210 : * This routine decides whether or not we need to move right in the
211 : * tree by examining the high key entry on the page. If that entry is
212 : * strictly less than the scankey, or <= the scankey in the
213 : * key.nextkey=true case, then we followed the wrong link and we need
214 : * to move right.
215 : *
216 : * The passed insertion-type scankey can omit the rightmost column(s) of the
217 : * index. (see nbtree/README)
218 : *
219 : * When key.nextkey is false (the usual case), we are looking for the first
220 : * item >= key. When key.nextkey is true, we are looking for the first item
221 : * strictly greater than key.
222 : *
223 : * If forupdate is true, we will attempt to finish any incomplete splits
224 : * that we encounter. This is required when locking a target page for an
225 : * insertion, because we don't allow inserting on a page before the split
226 : * is completed. 'stack' is only used if forupdate is true.
227 : *
228 : * On entry, we have the buffer pinned and a lock of the type specified by
229 : * 'access'. If we move right, we release the buffer and lock and acquire
230 : * the same on the right sibling. Return value is the buffer we stop at.
231 : *
232 : * If the snapshot parameter is not NULL, "old snapshot" checking will take
233 : * place during the descent through the tree. This is not needed when
234 : * positioning for an insert or delete, so NULL is used for those cases.
235 : */
236 : Buffer
9770 scrappy 237 GIC 29747392 : _bt_moveright(Relation rel,
238 : Relation heaprel,
1481 pg 239 ECB : BTScanInsert key,
240 : Buffer buf,
241 : bool forupdate,
242 : BTStack stack,
243 : int access,
244 : Snapshot snapshot)
245 : {
246 : Page page;
247 : BTPageOpaque opaque;
248 : int32 cmpval;
249 :
250 : /*
251 : * When nextkey = false (normal case): if the scan key that brought us to
252 : * this page is > the high key stored on the page, then the page has split
253 : * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
254 : * have some duplicates to the right as well as the left, but that's
255 : * something that's only ever dealt with on the leaf level, after
256 : * _bt_search has found an initial leaf page.)
257 : *
258 : * When nextkey = true: move right if the scan key is >= page's high key.
259 : * (Note that key.scantid cannot be set in this case.)
260 : *
261 : * The page could even have split more than once, so scan as far as
262 : * needed.
263 : *
264 : * We also have to move right if we followed a link that brought us to a
265 : * dead page.
266 : */
1481 pg 267 GIC 29747392 : cmpval = key->nextkey ? 0 : 1;
268 :
3309 heikki.linnakangas 269 ECB : for (;;)
270 : {
2545 kgrittn 271 GIC 29748068 : page = BufferGetPage(buf);
272 29748068 : TestForOldSnapshot(snapshot, rel, page);
373 michael 273 CBC 29748068 : opaque = BTPageGetOpaque(page);
3309 heikki.linnakangas 274 ECB :
3309 heikki.linnakangas 275 CBC 29748068 : if (P_RIGHTMOST(opaque))
3309 heikki.linnakangas 276 GIC 23650629 : break;
3309 heikki.linnakangas 277 ECB :
278 : /*
279 : * Finish any incomplete splits we encounter along the way.
280 : */
3309 heikki.linnakangas 281 GIC 6097439 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
3309 heikki.linnakangas 282 UIC 0 : {
3309 heikki.linnakangas 283 LBC 0 : BlockNumber blkno = BufferGetBlockNumber(buf);
3309 heikki.linnakangas 284 EUB :
285 : /* upgrade our lock if necessary */
3309 heikki.linnakangas 286 UIC 0 : if (access == BT_READ)
287 : {
992 pg 288 UBC 0 : _bt_unlockbuf(rel, buf);
992 pg 289 UIC 0 : _bt_lockbuf(rel, buf, BT_WRITE);
3309 heikki.linnakangas 290 EUB : }
291 :
3309 heikki.linnakangas 292 UIC 0 : if (P_INCOMPLETE_SPLIT(opaque))
8 andres 293 UNC 0 : _bt_finish_split(rel, heaprel, buf, stack);
3309 heikki.linnakangas 294 EUB : else
3309 heikki.linnakangas 295 UBC 0 : _bt_relbuf(rel, buf);
296 :
3309 heikki.linnakangas 297 EUB : /* re-acquire the lock in the right mode, and re-check */
8 andres 298 UNC 0 : buf = _bt_getbuf(rel, heaprel, blkno, access);
3309 heikki.linnakangas 299 UIC 0 : continue;
3309 heikki.linnakangas 300 EUB : }
301 :
1481 pg 302 GIC 6097439 : if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
303 : {
3309 heikki.linnakangas 304 ECB : /* step right one page */
3309 heikki.linnakangas 305 GIC 676 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
306 676 : continue;
3309 heikki.linnakangas 307 ECB : }
308 : else
309 : break;
310 : }
311 :
7351 tgl 312 GIC 29747392 : if (P_IGNORE(opaque))
5578 tgl 313 UIC 0 : elog(ERROR, "fell off the end of index \"%s\"",
7351 tgl 314 ECB : RelationGetRelationName(rel));
7351 tgl 315 EUB :
8986 bruce 316 GIC 29747392 : return buf;
317 : }
9770 scrappy 318 ECB :
319 : /*
320 : * _bt_binsrch() -- Do a binary search for a key on a particular page.
321 : *
322 : * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
323 : * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
324 : * particular, this means it is possible to return a value 1 greater than the
325 : * number of keys on the page, if the scankey is > all keys on the page.)
326 : *
327 : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
328 : * of the last key < given scankey, or last key <= given scankey if nextkey
329 : * is true. (Since _bt_compare treats the first data key of such a page as
330 : * minus infinity, there will be at least one key < scankey, so the result
331 : * always points at one of the keys on the page.) This key indicates the
332 : * right place to descend to be sure we find all leaf keys >= given scankey
333 : * (or leaf keys > given scankey when nextkey is true).
334 : *
335 : * This procedure is not responsible for walking right, it just examines
336 : * the given page. _bt_binsrch() has no lock or refcount side effects
337 : * on the buffer.
338 : */
339 : static OffsetNumber
9770 scrappy 340 GIC 20943015 : _bt_binsrch(Relation rel,
341 : BTScanInsert key,
1481 pg 342 ECB : Buffer buf)
343 : {
344 : Page page;
345 : BTPageOpaque opaque;
346 : OffsetNumber low,
347 : high;
348 : int32 result,
349 : cmpval;
350 :
2545 kgrittn 351 GIC 20943015 : page = BufferGetPage(buf);
373 michael 352 20943015 : opaque = BTPageGetOpaque(page);
9345 bruce 353 ECB :
1308 pg 354 : /* Requesting nextkey semantics while using scantid seems nonsensical */
1308 pg 355 GIC 20943015 : Assert(!key->nextkey || key->scantid == NULL);
356 : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
1308 pg 357 CBC 20943015 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
358 :
8297 tgl 359 20943015 : low = P_FIRSTDATAKEY(opaque);
9345 bruce 360 GIC 20943015 : high = PageGetMaxOffsetNumber(page);
9345 bruce 361 ECB :
362 : /*
363 : * If there are no keys on the page, return the first available slot. Note
364 : * this covers two cases: the page is really empty (no keys), or it
365 : * contains only a high key. The latter case is possible after vacuuming.
366 : * This can never happen on an internal page, however, since they are
367 : * never empty (an internal page must have children).
368 : */
1481 pg 369 GIC 20943015 : if (unlikely(high < low))
8986 bruce 370 2170 : return low;
9345 bruce 371 ECB :
8668 tgl 372 : /*
373 : * Binary search to find the first key on the page >= scan key, or first
374 : * key > scankey when nextkey is true.
375 : *
376 : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
377 : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
378 : *
379 : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
380 : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
381 : *
382 : * We can fall out when high == low.
383 : */
8668 tgl 384 GIC 20940845 : high++; /* establish the loop invariant for high */
385 :
1481 pg 386 CBC 20940845 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
387 :
8668 tgl 388 127472718 : while (high > low)
389 : {
8397 bruce 390 106531873 : OffsetNumber mid = low + ((high - low) / 2);
391 :
8668 tgl 392 ECB : /* We have low <= mid < high, so mid points at a real slot */
393 :
1481 pg 394 GIC 106531873 : result = _bt_compare(rel, key, page, mid);
395 :
7049 tgl 396 CBC 106531873 : if (result >= cmpval)
8668 tgl 397 GIC 70298342 : low = mid + 1;
9345 bruce 398 ECB : else
8668 tgl 399 CBC 36233531 : high = mid;
400 : }
9345 bruce 401 ECB :
402 : /*
403 : * At this point we have high == low, but be careful: they could point
404 : * past the last slot on the page.
405 : *
406 : * On a leaf page, we always return the first key >= scan key (resp. >
407 : * scan key), which could be the last slot + 1.
408 : */
8297 tgl 409 GIC 20940845 : if (P_ISLEAF(opaque))
8668 410 8666182 : return low;
9345 bruce 411 ECB :
8053 412 : /*
413 : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
414 : * There must be one if _bt_compare() is playing by the rules.
415 : */
8297 tgl 416 GIC 12274663 : Assert(low > P_FIRSTDATAKEY(opaque));
417 :
8668 tgl 418 CBC 12274663 : return OffsetNumberPrev(low);
419 : }
9770 scrappy 420 ECB :
421 : /*
422 : *
423 : * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
424 : *
425 : * Like _bt_binsrch(), but with support for caching the binary search
426 : * bounds. Only used during insertion, and only on the leaf page that it
427 : * looks like caller will insert tuple on. Exclusive-locked and pinned
428 : * leaf page is contained within insertstate.
429 : *
430 : * Caches the bounds fields in insertstate so that a subsequent call can
431 : * reuse the low and strict high bounds of original binary search. Callers
432 : * that use these fields directly must be prepared for the case where low
433 : * and/or stricthigh are not on the same page (one or both exceed maxoff
434 : * for the page). The case where there are no items on the page (high <
435 : * low) makes bounds invalid.
436 : *
437 : * Caller is responsible for invalidating bounds when it modifies the page
438 : * before calling here a second time, and for dealing with posting list
439 : * tuple matches (callers can use insertstate's postingoff field to
440 : * determine which existing heap TID will need to be replaced by a posting
441 : * list split).
442 : */
443 : OffsetNumber
1481 pg 444 GIC 12616278 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
445 : {
1481 pg 446 CBC 12616278 : BTScanInsert key = insertstate->itup_key;
447 : Page page;
1481 pg 448 ECB : BTPageOpaque opaque;
449 : OffsetNumber low,
450 : high,
451 : stricthigh;
452 : int32 result,
453 : cmpval;
454 :
1481 pg 455 GIC 12616278 : page = BufferGetPage(insertstate->buf);
373 michael 456 12616278 : opaque = BTPageGetOpaque(page);
1481 pg 457 ECB :
1481 pg 458 CBC 12616278 : Assert(P_ISLEAF(opaque));
1481 pg 459 GIC 12616278 : Assert(!key->nextkey);
1138 pg 460 CBC 12616278 : Assert(insertstate->postingoff == 0);
1481 pg 461 ECB :
1481 pg 462 CBC 12616278 : if (!insertstate->bounds_valid)
463 : {
1481 pg 464 ECB : /* Start new binary search */
1481 pg 465 GIC 7384948 : low = P_FIRSTDATAKEY(opaque);
466 7384948 : high = PageGetMaxOffsetNumber(page);
1481 pg 467 ECB : }
468 : else
469 : {
470 : /* Restore result of previous binary search against same page */
1481 pg 471 GIC 5231330 : low = insertstate->low;
472 5231330 : high = insertstate->stricthigh;
1481 pg 473 ECB : }
474 :
475 : /* If there are no keys on the page, return the first available slot */
1481 pg 476 GIC 12616278 : if (unlikely(high < low))
477 : {
1481 pg 478 ECB : /* Caller can't reuse bounds */
1481 pg 479 GIC 19103 : insertstate->low = InvalidOffsetNumber;
480 19103 : insertstate->stricthigh = InvalidOffsetNumber;
1481 pg 481 CBC 19103 : insertstate->bounds_valid = false;
482 19103 : return low;
1481 pg 483 ECB : }
484 :
485 : /*
486 : * Binary search to find the first key on the page >= scan key. (nextkey
487 : * is always false when inserting).
488 : *
489 : * The loop invariant is: all slots before 'low' are < scan key, all slots
490 : * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
491 : * maintained to save additional search effort for caller.
492 : *
493 : * We can fall out when high == low.
494 : */
1481 pg 495 GIC 12597175 : if (!insertstate->bounds_valid)
496 7365845 : high++; /* establish the loop invariant for high */
1481 pg 497 CBC 12597175 : stricthigh = high; /* high initially strictly higher */
1481 pg 498 ECB :
1481 pg 499 CBC 12597175 : cmpval = 1; /* !nextkey comparison value */
500 :
501 65283644 : while (high > low)
502 : {
503 52686469 : OffsetNumber mid = low + ((high - low) / 2);
504 :
1481 pg 505 ECB : /* We have low <= mid < high, so mid points at a real slot */
506 :
1481 pg 507 GIC 52686469 : result = _bt_compare(rel, key, page, mid);
508 :
1481 pg 509 CBC 52686469 : if (result >= cmpval)
1481 pg 510 GIC 40616979 : low = mid + 1;
1481 pg 511 ECB : else
512 : {
1481 pg 513 GIC 12069490 : high = mid;
514 12069490 : if (result != 0)
1481 pg 515 CBC 11435404 : stricthigh = high;
1481 pg 516 ECB : }
1138 517 :
518 : /*
519 : * If tuple at offset located by binary search is a posting list whose
520 : * TID range overlaps with caller's scantid, perform posting list
521 : * binary search to set postingoff for caller. Caller must split the
522 : * posting list when postingoff is set. This should happen
523 : * infrequently.
524 : */
1138 pg 525 GIC 52686469 : if (unlikely(result == 0 && key->scantid != NULL))
526 : {
529 pg 527 ECB : /*
528 : * postingoff should never be set more than once per leaf page
529 : * binary search. That would mean that there are duplicate table
530 : * TIDs in the index, which is never okay. Check for that here.
531 : */
529 pg 532 GIC 207807 : if (insertstate->postingoff != 0)
529 pg 533 UIC 0 : ereport(ERROR,
529 pg 534 ECB : (errcode(ERRCODE_INDEX_CORRUPTED),
529 pg 535 EUB : errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
536 : ItemPointerGetBlockNumber(key->scantid),
537 : ItemPointerGetOffsetNumber(key->scantid),
538 : low, stricthigh,
539 : BufferGetBlockNumber(insertstate->buf),
540 : RelationGetRelationName(rel))));
541 :
1138 pg 542 GIC 207807 : insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
543 : }
1481 pg 544 ECB : }
545 :
546 : /*
547 : * On a leaf page, a binary search always returns the first key >= scan
548 : * key (at least in !nextkey case), which could be the last slot + 1. This
549 : * is also the lower bound of cached search.
550 : *
551 : * stricthigh may also be the last slot + 1, which prevents caller from
552 : * using bounds directly, but is still useful to us if we're called a
553 : * second time with cached bounds (cached low will be < stricthigh when
554 : * that happens).
555 : */
1481 pg 556 GIC 12597175 : insertstate->low = low;
557 12597175 : insertstate->stricthigh = stricthigh;
1481 pg 558 CBC 12597175 : insertstate->bounds_valid = true;
1481 pg 559 ECB :
1481 pg 560 CBC 12597175 : return low;
561 : }
1481 pg 562 ECB :
563 : /*----------
564 : * _bt_binsrch_posting() -- posting list binary search.
565 : *
566 : * Helper routine for _bt_binsrch_insert().
567 : *
568 : * Returns offset into posting list where caller's scantid belongs.
569 : *----------
570 : */
571 : static int
1138 pg 572 GIC 207807 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
573 : {
1138 pg 574 ECB : IndexTuple itup;
575 : ItemId itemid;
576 : int low,
577 : high,
578 : mid,
579 : res;
580 :
581 : /*
582 : * If this isn't a posting tuple, then the index must be corrupt (if it is
583 : * an ordinary non-pivot tuple then there must be an existing tuple with a
584 : * heap TID that equals inserter's new heap TID/scantid). Defensively
585 : * check that tuple is a posting list tuple whose posting list range
586 : * includes caller's scantid.
587 : *
588 : * (This is also needed because contrib/amcheck's rootdescend option needs
589 : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
590 : */
1138 pg 591 GIC 207807 : itemid = PageGetItemId(page, offnum);
592 207807 : itup = (IndexTuple) PageGetItem(page, itemid);
1138 pg 593 CBC 207807 : if (!BTreeTupleIsPosting(itup))
594 200041 : return 0;
1138 pg 595 ECB :
1138 pg 596 CBC 7766 : Assert(key->heapkeyspace && key->allequalimage);
597 :
1138 pg 598 ECB : /*
599 : * In the event that posting list tuple has LP_DEAD bit set, indicate this
600 : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
601 : * second call to _bt_binsrch_insert() can take place when its caller has
602 : * removed the dead item.
603 : */
1138 pg 604 GIC 7766 : if (ItemIdIsDead(itemid))
1138 pg 605 UIC 0 : return -1;
1138 pg 606 ECB :
1138 pg 607 EUB : /* "high" is past end of posting list for loop invariant */
1138 pg 608 GIC 7766 : low = 0;
609 7766 : high = BTreeTupleGetNPosting(itup);
1138 pg 610 CBC 7766 : Assert(high >= 2);
1138 pg 611 ECB :
1138 pg 612 CBC 62476 : while (high > low)
613 : {
614 54710 : mid = low + ((high - low) / 2);
1138 pg 615 GIC 54710 : res = ItemPointerCompare(key->scantid,
1138 pg 616 ECB : BTreeTupleGetPostingN(itup, mid));
617 :
1138 pg 618 GIC 54710 : if (res > 0)
619 28848 : low = mid + 1;
1138 pg 620 CBC 25862 : else if (res < 0)
621 25862 : high = mid;
1138 pg 622 ECB : else
1138 pg 623 LBC 0 : return mid;
624 : }
1138 pg 625 EUB :
626 : /* Exact match not found */
1138 pg 627 GIC 7766 : return low;
628 : }
1138 pg 629 ECB :
630 : /*----------
631 : * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
632 : *
633 : * page/offnum: location of btree item to be compared to.
634 : *
635 : * This routine returns:
636 : * <0 if scankey < tuple at offnum;
637 : * 0 if scankey == tuple at offnum;
638 : * >0 if scankey > tuple at offnum.
639 : *
640 : * NULLs in the keys are treated as sortable values. Therefore
641 : * "equality" does not necessarily mean that the item should be returned
642 : * to the caller as a matching key. Similarly, an insertion scankey
643 : * with its scantid set is treated as equal to a posting tuple whose TID
644 : * range overlaps with their scantid. There generally won't be a
645 : * matching TID in the posting tuple, which caller must handle
646 : * themselves (e.g., by splitting the posting list tuple).
647 : *
648 : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
649 : * "minus infinity": this routine will always claim it is less than the
650 : * scankey. The actual key value stored is explicitly truncated to 0
651 : * attributes (explicitly minus infinity) with version 3+ indexes, but
652 : * that isn't relied upon. This allows us to implement the Lehman and
653 : * Yao convention that the first down-link pointer is before the first
654 : * key. See backend/access/nbtree/README for details.
655 : *----------
656 : */
657 : int32
9770 scrappy 658 GIC 177440905 : _bt_compare(Relation rel,
659 : BTScanInsert key,
8297 tgl 660 ECB : Page page,
661 : OffsetNumber offnum)
662 : {
8297 tgl 663 GIC 177440905 : TupleDesc itupdesc = RelationGetDescr(rel);
373 michael 664 177440905 : BTPageOpaque opaque = BTPageGetOpaque(page);
9344 bruce 665 ECB : IndexTuple itup;
1481 pg 666 : ItemPointer heapTid;
667 : ScanKey scankey;
668 : int ncmpkey;
669 : int ntupatts;
670 : int32 result;
671 :
1481 pg 672 GIC 177440905 : Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
673 177440905 : Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
1481 pg 674 CBC 177440905 : Assert(key->heapkeyspace || key->scantid == NULL);
1828 teodor 675 ECB :
9770 scrappy 676 : /*
677 : * Force result ">" if target item is first data item on an internal page
678 : * --- see NOTE above.
679 : */
8053 bruce 680 GIC 177440905 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
8986 681 1889977 : return 1;
9345 bruce 682 ECB :
6283 tgl 683 CBC 175550928 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1481 pg 684 GIC 175550928 : ntupatts = BTreeTupleGetNAtts(itup, rel);
9345 bruce 685 ECB :
9770 scrappy 686 : /*
687 : * The scan key is set up with the attribute number associated with each
688 : * term in the key. It is important that, if the index is multi-key, the
689 : * scan contain the first k key attributes, and that they be in order. If
690 : * you think about how multi-key ordering works, you'll understand why
691 : * this is.
692 : *
693 : * We don't test for violation of this condition here, however. The
694 : * initial setup for the index scan had better have gotten it right (see
695 : * _bt_first).
696 : */
697 :
1481 pg 698 GIC 175550928 : ncmpkey = Min(ntupatts, key->keysz);
699 175550928 : Assert(key->heapkeyspace || ncmpkey == key->keysz);
1138 pg 700 CBC 175550928 : Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
1481 701 175550928 : scankey = key->scankeys;
702 220720244 : for (int i = 1; i <= ncmpkey; i++)
9512 vadim4o 703 ECB : {
8297 tgl 704 : Datum datum;
705 : bool isNull;
706 :
7088 tgl 707 GIC 207864745 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
708 :
2118 tgl 709 CBC 207864745 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
710 : {
8297 711 170918 : if (isNull)
8349 tgl 712 GIC 83 : result = 0; /* NULL "=" NULL */
5934 tgl 713 CBC 170835 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
714 132 : result = -1; /* NULL "<" NOT_NULL */
9345 bruce 715 ECB : else
8349 tgl 716 CBC 170703 : result = 1; /* NULL ">" NOT_NULL */
717 : }
8297 718 207693827 : else if (isNull) /* key is NOT_NULL and item is NULL */
719 : {
5934 720 132 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
5934 tgl 721 UIC 0 : result = 1; /* NOT_NULL ">" NULL */
5934 tgl 722 ECB : else
5934 tgl 723 GBC 132 : result = -1; /* NOT_NULL "<" NULL */
724 : }
9345 bruce 725 ECB : else
726 : {
727 : /*
728 : * The sk_func needs to be passed the index value as left arg and
729 : * the sk_argument as right arg (they might be of different
730 : * types). Since it is convenient for callers to think of
731 : * _bt_compare as comparing the scankey to the index item, we have
732 : * to flip the sign of the comparison result. (Unless it's a DESC
733 : * column, in which case we *don't* flip the sign.)
734 : */
4380 tgl 735 GIC 207693695 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
736 : scankey->sk_collation,
4380 tgl 737 ECB : datum,
738 : scankey->sk_argument));
739 :
5934 tgl 740 GIC 207693695 : if (!(scankey->sk_flags & SK_BT_DESC))
1647 741 207693692 : INVERT_COMPARE_RESULT(result);
8297 tgl 742 ECB : }
9345 bruce 743 :
744 : /* if the keys are unequal, return the difference */
9345 bruce 745 GIC 207864745 : if (result != 0)
8986 746 162695429 : return result;
7088 tgl 747 ECB :
7088 tgl 748 CBC 45169316 : scankey++;
749 : }
9345 bruce 750 ECB :
751 : /*
752 : * All non-truncated attributes (other than heap TID) were found to be
753 : * equal. Treat truncated attributes as minus infinity when scankey has a
754 : * key attribute value that would otherwise be compared directly.
755 : *
756 : * Note: it doesn't matter if ntupatts includes non-key attributes;
757 : * scankey won't, so explicitly excluding non-key attributes isn't
758 : * necessary.
759 : */
1481 pg 760 GIC 12855499 : if (key->keysz > ntupatts)
761 125172 : return 1;
1481 pg 762 ECB :
763 : /*
764 : * Use the heap TID attribute and scantid to try to break the tie. The
765 : * rules are the same as any other key attribute -- only the
766 : * representation differs.
767 : */
1481 pg 768 GIC 12730327 : heapTid = BTreeTupleGetHeapTID(itup);
769 12730327 : if (key->scantid == NULL)
1481 pg 770 ECB : {
771 : /*
772 : * Most searches have a scankey that is considered greater than a
773 : * truncated pivot tuple if and when the scankey has equal values for
774 : * attributes up to and including the least significant untruncated
775 : * attribute in tuple.
776 : *
777 : * For example, if an index has the minimum two attributes (single
778 : * user key attribute, plus heap TID attribute), and a page's high key
779 : * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
780 : * will not descend to the page to the left. The search will descend
781 : * right instead. The truncated attribute in pivot tuple means that
782 : * all non-pivot tuples on the page to the left are strictly < 'foo',
783 : * so it isn't necessary to descend left. In other words, search
784 : * doesn't have to descend left because it isn't interested in a match
785 : * that has a heap TID value of -inf.
786 : *
787 : * However, some searches (pivotsearch searches) actually require that
788 : * we descend left when this happens. -inf is treated as a possible
789 : * match for omitted scankey attribute(s). This is needed by page
790 : * deletion, which must re-find leaf pages that are targets for
791 : * deletion using their high keys.
792 : *
793 : * Note: the heap TID part of the test ensures that scankey is being
794 : * compared to a pivot tuple with one or more truncated key
795 : * attributes.
796 : *
797 : * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
798 : * left here, since they have no heap TID attribute (and cannot have
799 : * any -inf key values in any case, since truncation can only remove
800 : * non-key attributes). !heapkeyspace searches must always be
801 : * prepared to deal with matches on both sides of the pivot once the
802 : * leaf level is reached.
803 : */
1481 pg 804 GIC 9213987 : if (key->heapkeyspace && !key->pivotsearch &&
805 9207253 : key->keysz == ntupatts && heapTid == NULL)
1481 pg 806 CBC 7404 : return 1;
1481 pg 807 ECB :
808 : /* All provided scankey arguments found to be equal */
1481 pg 809 GIC 9206583 : return 0;
810 : }
1481 pg 811 ECB :
812 : /*
813 : * Treat truncated heap TID as minus infinity, since scankey has a key
814 : * attribute value (scantid) that would otherwise be compared directly
815 : */
1481 pg 816 GIC 3516340 : Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
817 3516340 : if (heapTid == NULL)
1481 pg 818 CBC 1978 : return 1;
1481 pg 819 ECB :
1138 820 : /*
821 : * Scankey must be treated as equal to a posting list tuple if its scantid
822 : * value falls within the range of the posting list. In all other cases
823 : * there can only be a single heap TID value, which is compared directly
824 : * with scantid.
825 : */
1481 pg 826 GIC 3514362 : Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
1138 827 3514362 : result = ItemPointerCompare(key->scantid, heapTid);
1138 pg 828 CBC 3513995 : if (result <= 0 || !BTreeTupleIsPosting(itup))
829 3394046 : return result;
1138 pg 830 ECB : else
831 : {
1138 pg 832 GIC 119949 : result = ItemPointerCompare(key->scantid,
833 : BTreeTupleGetMaxHeapTID(itup));
1138 pg 834 CBC 119949 : if (result > 0)
1138 pg 835 GIC 112183 : return 1;
1138 pg 836 ECB : }
837 :
1138 pg 838 GIC 7766 : return 0;
839 : }
9770 scrappy 840 ECB :
841 : /*
842 : * _bt_first() -- Find the first item in a scan.
843 : *
844 : * We need to be clever about the direction of scan, the search
845 : * conditions, and the tree ordering. We find the first item (or,
846 : * if backwards scan, the last item) in the tree that satisfies the
847 : * qualifications in the scan key. On success exit, the page containing
848 : * the current index tuple is pinned but not locked, and data about
849 : * the matching tuple(s) on the page has been loaded into so->currPos.
850 : * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
851 : * and if requested, scan->xs_itup points to a copy of the index tuple.
852 : *
853 : * If there are no matching items in the index, we return false, with no
854 : * pins or locks held.
855 : *
856 : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
857 : * are both search-type scankeys (see nbtree/README for more about this).
858 : * Within this routine, we build a temporary insertion-type scankey to use
859 : * in locating the scan start position.
860 : */
861 : bool
9770 scrappy 862 GIC 9087979 : _bt_first(IndexScanDesc scan, ScanDirection dir)
863 : {
7625 tgl 864 CBC 9087979 : Relation rel = scan->indexRelation;
8 andres 865 GNC 9087979 : Relation heaprel = scan->heapRelation;
7625 tgl 866 GIC 9087979 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
9344 bruce 867 ECB : Buffer buf;
868 : BTStack stack;
8297 tgl 869 : OffsetNumber offnum;
870 : StrategyNumber strat;
871 : bool nextkey;
872 : bool goback;
873 : BTScanInsertData inskey;
874 : ScanKey startKeys[INDEX_MAX_KEYS];
875 : ScanKeyData notnullkeys[INDEX_MAX_KEYS];
8397 bruce 876 GIC 9087979 : int keysCount = 0;
877 : int i;
878 : bool status;
8397 bruce 879 ECB : StrategyNumber strat_total;
880 : BTScanPosItem *currItem;
881 : BlockNumber blkno;
882 :
2937 kgrittn 883 GIC 9087979 : Assert(!BTScanPosIsValid(so->currPos));
884 :
5796 tgl 885 9087979 : pgstat_count_index_scan(rel);
6394 tgl 886 ECB :
887 : /*
6285 888 : * Examine the scan keys and eliminate any redundant keys; also mark the
889 : * keys that must be matched to continue the scan.
890 : */
7088 tgl 891 GIC 9087979 : _bt_preprocess_keys(scan);
892 :
893 : /*
7088 tgl 894 ECB : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
895 : * never be satisfied (eg, x == 1 AND x > 2).
896 : */
8053 bruce 897 GIC 9087979 : if (!so->qual_ok)
898 : {
899 : /* Notify any other workers that we're done with this scan key. */
934 akapila 900 CBC 1286 : _bt_parallel_done(scan);
7629 tgl 901 GIC 1286 : return false;
902 : }
8293 tgl 903 ECB :
2244 rhaas 904 : /*
905 : * For parallel scans, get the starting page from shared state. If the
906 : * scan has not started, proceed to find out first leaf page in the usual
907 : * way while keeping other participating processes waiting. If the scan
908 : * has already begun, use the page number from the shared structure.
909 : */
2244 rhaas 910 GIC 9086693 : if (scan->parallel_scan != NULL)
911 : {
912 184 : status = _bt_parallel_seize(scan, &blkno);
2244 rhaas 913 CBC 184 : if (!status)
2244 rhaas 914 GIC 140 : return false;
2244 rhaas 915 CBC 44 : else if (blkno == P_NONE)
2244 rhaas 916 ECB : {
2244 rhaas 917 CBC 2 : _bt_parallel_done(scan);
918 2 : return false;
919 : }
920 42 : else if (blkno != InvalidBlockNumber)
2244 rhaas 921 ECB : {
2244 rhaas 922 GIC 4 : if (!_bt_parallel_readpage(scan, blkno, dir))
2244 rhaas 923 LBC 0 : return false;
2244 rhaas 924 GIC 4 : goto readcomplete;
2244 rhaas 925 ECB : }
2244 rhaas 926 EUB : }
2244 rhaas 927 ECB :
928 : /*----------
929 : * Examine the scan keys to discover where we need to start the scan.
930 : *
931 : * We want to identify the keys that can be used as starting boundaries;
932 : * these are =, >, or >= keys for a forward scan or =, <, <= keys for
933 : * a backwards scan. We can use keys for multiple attributes so long as
934 : * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
935 : * a > or < boundary or find an attribute with no boundary (which can be
936 : * thought of as the same as "> -infinity"), we can't use keys for any
937 : * attributes to its right, because it would break our simplistic notion
938 : * of what initial positioning strategy to use.
939 : *
940 : * When the scan keys include cross-type operators, _bt_preprocess_keys
941 : * may not be able to eliminate redundant keys; in such cases we will
942 : * arbitrarily pick a usable one for each attribute. This is correct
943 : * but possibly not optimal behavior. (For example, with keys like
944 : * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
945 : * x=5 would be more efficient.) Since the situation only arises given
946 : * a poorly-worded query plus an incomplete opfamily, live with it.
947 : *
948 : * When both equality and inequality keys appear for a single attribute
949 : * (again, only possible when cross-type operators appear), we *must*
950 : * select one of the equality keys for the starting point, because
951 : * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
952 : * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
953 : * start at x=4, we will fail and stop before reaching x=10. If multiple
954 : * equality quals survive preprocessing, however, it doesn't matter which
955 : * one we use --- by definition, they are either redundant or
956 : * contradictory.
957 : *
958 : * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
959 : * If the index stores nulls at the end of the index we'll be starting
960 : * from, and we have no boundary key for the column (which means the key
961 : * we deduced NOT NULL from is an inequality key that constrains the other
962 : * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
963 : * use as a boundary key. If we didn't do this, we might find ourselves
964 : * traversing a lot of null entries at the start of the scan.
965 : *
966 : * In this loop, row-comparison keys are treated the same as keys on their
967 : * first (leftmost) columns. We'll add on lower-order columns of the row
968 : * comparison below, if possible.
969 : *
970 : * The selected scan keys (at most one per index column) are remembered by
971 : * storing their addresses into the local startKeys[] array.
972 : *----------
973 : */
8595 bruce 974 GIC 9086547 : strat_total = BTEqualStrategyNumber;
8293 tgl 975 9086547 : if (so->numberOfKeys > 0)
976 : {
7088 tgl 977 ECB : AttrNumber curattr;
978 : ScanKey chosen;
979 : ScanKey impliesNN;
980 : ScanKey cur;
981 :
982 : /*
983 : * chosen is the so-far-chosen key for the current attribute, if any.
984 : * We don't cast the decision in stone until we reach keys for the
985 : * next attribute.
986 : */
7088 tgl 987 GIC 9081689 : curattr = 1;
988 9081689 : chosen = NULL;
989 : /* Also remember any scankey that implies a NOT NULL constraint */
4176 tgl 990 CBC 9081689 : impliesNN = NULL;
6797 bruce 991 ECB :
992 : /*
7088 tgl 993 : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
994 : * pass to handle after-last-key processing. Actual exit from the
995 : * loop is at one of the "break" statements below.
996 : */
7088 tgl 997 GIC 24989949 : for (cur = so->keyData, i = 0;; cur++, i++)
998 : {
999 24989949 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
8595 bruce 1000 ECB : {
1001 : /*
7088 tgl 1002 : * Done looking at keys for curattr. If we didn't find a
1003 : * usable boundary key, see if we can deduce a NOT NULL key.
1004 : */
4176 tgl 1005 GIC 15934530 : if (chosen == NULL && impliesNN != NULL &&
1006 25904 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1007 : ScanDirectionIsForward(dir) :
4176 tgl 1008 ECB : ScanDirectionIsBackward(dir)))
1009 : {
1010 : /* Yes, so build the key in notnullkeys[keysCount] */
4176 tgl 1011 GIC 3 : chosen = ¬nullkeys[keysCount];
1012 3 : ScanKeyEntryInitialize(chosen,
1013 : (SK_SEARCHNOTNULL | SK_ISNULL |
4176 tgl 1014 CBC 3 : (impliesNN->sk_flags &
2118 tgl 1015 ECB : (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1016 : curattr,
2118 tgl 1017 CBC 3 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1018 : BTGreaterStrategyNumber :
1019 : BTLessStrategyNumber),
4176 tgl 1020 ECB : InvalidOid,
1021 : InvalidOid,
1022 : InvalidOid,
1023 : (Datum) 0);
1024 : }
1025 :
1026 : /*
1027 : * If we still didn't find a usable boundary key, quit; else
1028 : * save the boundary key pointer in startKeys.
1029 : */
7088 tgl 1030 GIC 15908626 : if (chosen == NULL)
1031 27175 : break;
1032 15881451 : startKeys[keysCount++] = chosen;
6797 bruce 1033 ECB :
7088 tgl 1034 : /*
6797 bruce 1035 : * Adjust strat_total, and quit if we have stored a > or <
1036 : * key.
1037 : */
7088 tgl 1038 GIC 15881451 : strat = chosen->sk_strategy;
1039 15881451 : if (strat != BTEqualStrategyNumber)
1040 : {
7088 tgl 1041 CBC 709058 : strat_total = strat;
1042 709058 : if (strat == BTGreaterStrategyNumber ||
1043 : strat == BTLessStrategyNumber)
7088 tgl 1044 ECB : break;
1045 : }
1046 :
1047 : /*
1048 : * Done if that was the last attribute, or if next key is not
1049 : * in sequence (implying no boundary key is available for the
1050 : * next attribute).
1051 : */
6509 tgl 1052 GIC 15174487 : if (i >= so->numberOfKeys ||
1053 6827667 : cur->sk_attno != curattr + 1)
1054 : break;
6797 bruce 1055 ECB :
7088 tgl 1056 : /*
1057 : * Reset for next attr.
1058 : */
7088 tgl 1059 GIC 6826937 : curattr = cur->sk_attno;
1060 6826937 : chosen = NULL;
4176 1061 6826937 : impliesNN = NULL;
8595 bruce 1062 ECB : }
7088 tgl 1063 :
4176 1064 : /*
1065 : * Can we use this key as a starting boundary for this attr?
1066 : *
1067 : * If not, does it imply a NOT NULL constraint? (Because
1068 : * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1069 : * *any* inequality key works for that; we need not test.)
1070 : */
7088 tgl 1071 GIC 15908260 : switch (cur->sk_strategy)
1072 : {
1073 48301 : case BTLessStrategyNumber:
7088 tgl 1074 ECB : case BTLessEqualStrategyNumber:
4176 tgl 1075 GIC 48301 : if (chosen == NULL)
4176 tgl 1076 ECB : {
4176 tgl 1077 GIC 47399 : if (ScanDirectionIsBackward(dir))
4176 tgl 1078 CBC 21504 : chosen = cur;
1079 : else
1080 25895 : impliesNN = cur;
4176 tgl 1081 ECB : }
7088 tgl 1082 GIC 48301 : break;
7088 tgl 1083 CBC 15172393 : case BTEqualStrategyNumber:
1084 : /* override any non-equality choice */
1085 15172393 : chosen = cur;
1086 15172393 : break;
7088 tgl 1087 GIC 687566 : case BTGreaterEqualStrategyNumber:
7088 tgl 1088 ECB : case BTGreaterStrategyNumber:
4176 tgl 1089 CBC 687566 : if (chosen == NULL)
4176 tgl 1090 ECB : {
4176 tgl 1091 GIC 687566 : if (ScanDirectionIsForward(dir))
4176 tgl 1092 CBC 687551 : chosen = cur;
1093 : else
1094 15 : impliesNN = cur;
4176 tgl 1095 ECB : }
8762 bruce 1096 GIC 687566 : break;
8762 bruce 1097 ECB : }
1098 : }
9345 1099 : }
1100 :
1101 : /*
1102 : * If we found no usable boundary keys, we have to start from one end of
1103 : * the tree. Walk down that edge to the first or last key, and scan from
1104 : * there.
1105 : */
7088 tgl 1106 GIC 9086547 : if (keysCount == 0)
1107 : {
1108 : bool match;
2244 rhaas 1109 ECB :
2244 rhaas 1110 GIC 31957 : match = _bt_endpoint(scan, dir);
1111 :
1112 31957 : if (!match)
2244 rhaas 1113 ECB : {
1114 : /* No match, so mark (parallel) scan finished */
2244 rhaas 1115 CBC 3978 : _bt_parallel_done(scan);
1116 : }
1117 :
1118 31957 : return match;
1119 : }
1120 :
9345 bruce 1121 ECB : /*
1122 : * We want to start the scan somewhere within the index. Set up an
1123 : * insertion scankey we can use to search for the boundary point we
1124 : * identified above. The insertion scankey is built using the keys
1125 : * identified by startKeys[]. (Remaining insertion scankey fields are
1126 : * initialized after initial-positioning strategy is finalized.)
1127 : */
6503 tgl 1128 GIC 9054590 : Assert(keysCount <= INDEX_MAX_KEYS);
8397 bruce 1129 24936029 : for (i = 0; i < keysCount; i++)
1130 : {
7088 tgl 1131 CBC 15881451 : ScanKey cur = startKeys[i];
8053 bruce 1132 ECB :
6031 bruce 1133 GIC 15881451 : Assert(cur->sk_attno == i + 1);
6797 bruce 1134 ECB :
6283 tgl 1135 GIC 15881451 : if (cur->sk_flags & SK_ROW_HEADER)
7088 tgl 1136 ECB : {
1137 : /*
6283 1138 : * Row comparison header: look to the first row member instead.
1139 : *
1140 : * The member scankeys are already in insertion format (ie, they
1141 : * have sk_func = 3-way-comparison function), but we have to watch
1142 : * out for nulls, which _bt_preprocess_keys didn't check. A null
1143 : * in the first row member makes the condition unmatchable, just
1144 : * like qual_ok = false.
1145 : */
5934 tgl 1146 GIC 12 : ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1147 :
1148 12 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
5934 tgl 1149 CBC 12 : if (subkey->sk_flags & SK_ISNULL)
1150 : {
2244 rhaas 1151 LBC 0 : _bt_parallel_done(scan);
6283 tgl 1152 0 : return false;
1153 : }
1481 pg 1154 GBC 12 : memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
6031 bruce 1155 EUB :
1156 : /*
6283 tgl 1157 ECB : * If the row comparison is the last positioning key we accepted,
1158 : * try to add additional keys from the lower-order row members.
1159 : * (If we accepted independent conditions on additional index
1160 : * columns, we use those instead --- doesn't seem worth trying to
1161 : * determine which is more restrictive.) Note that this is OK
1162 : * even if the row comparison is of ">" or "<" type, because the
1163 : * condition applied to all but the last row member is effectively
1164 : * ">=" or "<=", and so the extra keys don't break the positioning
1165 : * scheme. But, by the same token, if we aren't able to use all
1166 : * the row members, then the part of the row comparison that we
1167 : * did use has to be treated as just a ">=" or "<=" condition, and
1168 : * so we'd better adjust strat_total accordingly.
1169 : */
6283 tgl 1170 GIC 12 : if (i == keysCount - 1)
1171 : {
5934 1172 12 : bool used_all_subkeys = false;
5934 tgl 1173 ECB :
5934 tgl 1174 GIC 12 : Assert(!(subkey->sk_flags & SK_ROW_END));
5624 bruce 1175 ECB : for (;;)
1176 : {
5934 tgl 1177 CBC 12 : subkey++;
5934 tgl 1178 GIC 12 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1179 12 : if (subkey->sk_attno != keysCount + 1)
6283 tgl 1180 LBC 0 : break; /* out-of-sequence, can't use it */
5934 tgl 1181 CBC 12 : if (subkey->sk_strategy != cur->sk_strategy)
5934 tgl 1182 LBC 0 : break; /* wrong direction, can't use it */
5934 tgl 1183 GBC 12 : if (subkey->sk_flags & SK_ISNULL)
6283 tgl 1184 LBC 0 : break; /* can't use null keys */
6283 tgl 1185 GBC 12 : Assert(keysCount < INDEX_MAX_KEYS);
1481 pg 1186 CBC 12 : memcpy(inskey.scankeys + keysCount, subkey,
1481 pg 1187 EUB : sizeof(ScanKeyData));
6283 tgl 1188 CBC 12 : keysCount++;
5934 1189 12 : if (subkey->sk_flags & SK_ROW_END)
1190 : {
1191 12 : used_all_subkeys = true;
1192 12 : break;
1193 : }
5934 tgl 1194 ECB : }
5934 tgl 1195 CBC 12 : if (!used_all_subkeys)
1196 : {
5934 tgl 1197 UIC 0 : switch (strat_total)
5934 tgl 1198 ECB : {
5934 tgl 1199 UIC 0 : case BTLessStrategyNumber:
5934 tgl 1200 UBC 0 : strat_total = BTLessEqualStrategyNumber;
5934 tgl 1201 UIC 0 : break;
5934 tgl 1202 UBC 0 : case BTGreaterStrategyNumber:
1203 0 : strat_total = BTGreaterEqualStrategyNumber;
1204 0 : break;
5934 tgl 1205 EUB : }
6283 1206 : }
6283 tgl 1207 GBC 12 : break; /* done with outer loop */
1208 : }
1209 : }
7088 tgl 1210 ECB : else
1211 : {
1212 : /*
1213 : * Ordinary comparison key. Transform the search-style scan key
1214 : * to an insertion scan key by replacing the sk_func with the
1215 : * appropriate btree comparison function.
1216 : *
1217 : * If scankey operator is not a cross-type comparison, we can use
1218 : * the cached comparison function; otherwise gotta look it up in
1219 : * the catalogs. (That can't lead to infinite recursion, since no
1220 : * indexscan initiated by syscache lookup will use cross-data-type
1221 : * operators.)
1222 : *
1223 : * We support the convention that sk_subtype == InvalidOid means
1224 : * the opclass input type; this is a hack to simplify life for
1225 : * ScanKeyInit().
1226 : */
5951 tgl 1227 GIC 15881439 : if (cur->sk_subtype == rel->rd_opcintype[i] ||
1228 15621462 : cur->sk_subtype == InvalidOid)
6283 1229 15875108 : {
6283 tgl 1230 ECB : FmgrInfo *procinfo;
1231 :
6283 tgl 1232 CBC 15875108 : procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1481 pg 1233 GIC 15875108 : ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1234 : cur->sk_flags,
6283 tgl 1235 CBC 15875108 : cur->sk_attno,
6283 tgl 1236 ECB : InvalidStrategy,
1237 : cur->sk_subtype,
4380 1238 : cur->sk_collation,
1239 : procinfo,
1240 : cur->sk_argument);
1241 : }
1242 : else
1243 : {
1244 : RegProcedure cmp_proc;
1245 :
5951 tgl 1246 GIC 6331 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1247 6331 : rel->rd_opcintype[i],
1248 : cur->sk_subtype,
5951 tgl 1249 ECB : BTORDER_PROC);
5951 tgl 1250 CBC 6331 : if (!RegProcedureIsValid(cmp_proc))
5951 tgl 1251 UIC 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1252 : BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
5951 tgl 1253 ECB : cur->sk_attno, RelationGetRelationName(rel));
1481 pg 1254 GBC 6331 : ScanKeyEntryInitialize(inskey.scankeys + i,
1255 : cur->sk_flags,
6283 tgl 1256 GIC 6331 : cur->sk_attno,
6283 tgl 1257 ECB : InvalidStrategy,
1258 : cur->sk_subtype,
4380 1259 : cur->sk_collation,
1260 : cmp_proc,
1261 : cur->sk_argument);
1262 : }
1263 : }
1264 : }
1265 :
1266 : /*----------
1267 : * Examine the selected initial-positioning strategy to determine exactly
1268 : * where we need to start the scan, and set flag variables to control the
1269 : * code below.
1270 : *
1271 : * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1272 : * item >= scan key. If nextkey = true, they will locate the first
1273 : * item > scan key.
1274 : *
1275 : * If goback = true, we will then step back one item, while if
1276 : * goback = false, we will start the scan on the located item.
1277 : *----------
1278 : */
7049 tgl 1279 GIC 9054590 : switch (strat_total)
1280 : {
1281 21507 : case BTLessStrategyNumber:
6797 bruce 1282 ECB :
1283 : /*
6385 1284 : * Find first item >= scankey, then back up one to arrive at last
1285 : * item < scankey. (Note: this positioning strategy is only used
1286 : * for a backward scan, so that is always the correct starting
1287 : * position.)
1288 : */
7049 tgl 1289 GIC 21507 : nextkey = false;
1290 21507 : goback = true;
1291 21507 : break;
7049 tgl 1292 ECB :
7049 tgl 1293 LBC 0 : case BTLessEqualStrategyNumber:
6797 bruce 1294 ECB :
1295 : /*
6385 bruce 1296 EUB : * Find first item > scankey, then back up one to arrive at last
1297 : * item <= scankey. (Note: this positioning strategy is only used
1298 : * for a backward scan, so that is always the correct starting
1299 : * position.)
1300 : */
7049 tgl 1301 UIC 0 : nextkey = true;
1302 0 : goback = true;
1303 0 : break;
7049 tgl 1304 EUB :
7049 tgl 1305 GBC 8345532 : case BTEqualStrategyNumber:
6797 bruce 1306 EUB :
1307 : /*
6385 bruce 1308 ECB : * If a backward scan was specified, need to start with last equal
1309 : * item not first one.
1310 : */
7049 tgl 1311 GIC 8345532 : if (ScanDirectionIsBackward(dir))
1312 : {
1313 : /*
6385 bruce 1314 ECB : * This is the same as the <= strategy. We will check at the
1315 : * end whether the found item is actually =.
1316 : */
7049 tgl 1317 GIC 64 : nextkey = true;
1318 64 : goback = true;
1319 : }
7049 tgl 1320 ECB : else
1321 : {
1322 : /*
1323 : * This is the same as the >= strategy. We will check at the
1324 : * end whether the found item is actually =.
1325 : */
7049 tgl 1326 GIC 8345468 : nextkey = false;
1327 8345468 : goback = false;
1328 : }
7049 tgl 1329 CBC 8345532 : break;
7049 tgl 1330 ECB :
7049 tgl 1331 GIC 2094 : case BTGreaterEqualStrategyNumber:
6797 bruce 1332 ECB :
1333 : /*
3260 1334 : * Find first item >= scankey. (This is only used for forward
1335 : * scans.)
1336 : */
7049 tgl 1337 GIC 2094 : nextkey = false;
1338 2094 : goback = false;
1339 2094 : break;
7049 tgl 1340 ECB :
7049 tgl 1341 CBC 685457 : case BTGreaterStrategyNumber:
6797 bruce 1342 ECB :
1343 : /*
1344 : * Find first item > scankey. (This is only used for forward
1345 : * scans.)
1346 : */
7049 tgl 1347 GIC 685457 : nextkey = true;
1348 685457 : goback = false;
1349 685457 : break;
7049 tgl 1350 ECB :
7049 tgl 1351 LBC 0 : default:
7049 tgl 1352 ECB : /* can't get here, but keep compiler quiet */
7049 tgl 1353 UIC 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
7049 tgl 1354 EUB : return false;
1355 : }
1356 :
1357 : /* Initialize remaining insertion scan key fields */
8 andres 1358 GNC 9054590 : _bt_metaversion(rel, heaprel, &inskey.heapkeyspace, &inskey.allequalimage);
1418 tgl 1359 GIC 9054590 : inskey.anynullkeys = false; /* unused */
1481 pg 1360 9054590 : inskey.nextkey = nextkey;
1481 pg 1361 CBC 9054590 : inskey.pivotsearch = false;
1362 9054590 : inskey.scantid = NULL;
1363 9054590 : inskey.keysz = keysCount;
1481 pg 1364 ECB :
9345 bruce 1365 : /*
6291 tgl 1366 : * Use the manufactured insertion scan key to descend the tree and
1367 : * position ourselves on the target leaf page.
1368 : */
8 andres 1369 GNC 9054590 : stack = _bt_search(rel, heaprel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1370 :
1371 : /* don't need to keep the stack around... */
8297 tgl 1372 CBC 9054590 : _bt_freestack(stack);
1373 :
8053 bruce 1374 GIC 9054590 : if (!BufferIsValid(buf))
9345 bruce 1375 ECB : {
1376 : /*
4315 heikki.linnakangas 1377 : * We only get here if the index is completely empty. Lock relation
1378 : * because nothing finer to lock exists.
1379 : */
4316 heikki.linnakangas 1380 GIC 385759 : PredicateLockRelation(rel, scan->xs_snapshot);
1381 :
1382 : /*
2244 rhaas 1383 ECB : * mark parallel scan as done, so that all the workers can finish
1384 : * their scan
1385 : */
2244 rhaas 1386 GIC 385759 : _bt_parallel_done(scan);
1387 385759 : BTScanPosInvalidate(so->currPos);
1388 :
7629 tgl 1389 CBC 385759 : return false;
9445 vadim4o 1390 ECB : }
1391 : else
4316 heikki.linnakangas 1392 CBC 8668831 : PredicateLockPage(rel, BufferGetBlockNumber(buf),
1393 : scan->xs_snapshot);
1394 :
2244 rhaas 1395 8668831 : _bt_initialize_more_data(so, dir);
1396 :
1397 : /* position to the precise item on the page */
1481 pg 1398 8668831 : offnum = _bt_binsrch(rel, &inskey, buf);
1399 :
1400 : /*
6385 bruce 1401 ECB : * If nextkey = false, we are positioned at the first item >= scan key, or
1402 : * possibly at the end of a page on which all the existing items are less
1403 : * than the scan key and we know that everything on later pages is greater
1404 : * than or equal to scan key.
1405 : *
1406 : * If nextkey = true, we are positioned at the first item > scan key, or
1407 : * possibly at the end of a page on which all the existing items are less
1408 : * than or equal to the scan key and we know that everything on later
1409 : * pages is greater than scan key.
1410 : *
1411 : * The actually desired starting point is either this item or the prior
1412 : * one, or in the end-of-page case it's the first item on the next page or
1413 : * the last item on this page. Adjust the starting offset if needed. (If
1414 : * this results in an offset before the first item or after the last one,
1415 : * _bt_readpage will report no items found, and then we'll step to the
1416 : * next page as needed.)
1417 : */
7049 tgl 1418 GIC 8668831 : if (goback)
6181 1419 21567 : offnum = OffsetNumberPrev(offnum);
1420 :
2937 kgrittn 1421 ECB : /* remember which buffer we have pinned, if any */
2937 kgrittn 1422 CBC 8668831 : Assert(!BTScanPosIsValid(so->currPos));
2937 kgrittn 1423 GIC 8668831 : so->currPos.buf = buf;
1424 :
6181 tgl 1425 ECB : /*
1426 : * Now load data from the first page of the scan.
1427 : */
6181 tgl 1428 GIC 8668831 : if (!_bt_readpage(scan, dir, offnum))
1429 : {
1430 : /*
6181 tgl 1431 ECB : * There's no actually-matching data on this page. Try to advance to
1432 : * the next page. Return false if there's no matching data at all.
1433 : */
992 pg 1434 GIC 3688427 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
6181 tgl 1435 3688427 : if (!_bt_steppage(scan, dir))
7049 1436 3688414 : return false;
7049 tgl 1437 ECB : }
2937 kgrittn 1438 : else
1439 : {
1440 : /* Drop the lock, and maybe the pin, on the current page */
2937 kgrittn 1441 GIC 4980404 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1442 : }
1443 :
2244 rhaas 1444 CBC 4980421 : readcomplete:
1445 : /* OK, itemIndex says what to return */
4200 tgl 1446 GIC 4980421 : currItem = &so->currPos.items[so->currPos.itemIndex];
1490 andres 1447 CBC 4980421 : scan->xs_heaptid = currItem->heapTid;
4200 tgl 1448 GIC 4980421 : if (scan->xs_want_itup)
4200 tgl 1449 CBC 64578 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
6181 tgl 1450 ECB :
6181 tgl 1451 CBC 4980421 : return true;
6181 tgl 1452 ECB : }
1453 :
1454 : /*
1455 : * _bt_next() -- Get the next item in a scan.
1456 : *
1457 : * On entry, so->currPos describes the current page, which may be pinned
1458 : * but is not locked, and so->currPos.itemIndex identifies which item was
1459 : * previously returned.
1460 : *
1461 : * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1462 : * next heap tuple, and if requested, scan->xs_itup points to a copy of
1463 : * the index tuple. so->currPos is updated as needed.
1464 : *
1465 : * On failure exit (no more tuples), we release pin and set
1466 : * so->currPos.buf to InvalidBuffer.
1467 : */
1468 : bool
6181 tgl 1469 GIC 11957219 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1470 : {
1471 11957219 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
4200 tgl 1472 ECB : BTScanPosItem *currItem;
1473 :
6181 1474 : /*
1475 : * Advance to next tuple on current page; or if there's no more, try to
1476 : * step to the next page with data.
1477 : */
6181 tgl 1478 GIC 11957219 : if (ScanDirectionIsForward(dir))
1479 : {
1480 11948015 : if (++so->currPos.itemIndex > so->currPos.lastItem)
6181 tgl 1481 ECB : {
6181 tgl 1482 GIC 1161767 : if (!_bt_steppage(scan, dir))
6181 tgl 1483 CBC 1147466 : return false;
1484 : }
6181 tgl 1485 ECB : }
7049 1486 : else
1487 : {
6181 tgl 1488 GIC 9204 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1489 : {
1490 37 : if (!_bt_steppage(scan, dir))
7629 tgl 1491 CBC 31 : return false;
1492 : }
9445 vadim4o 1493 ECB : }
9345 bruce 1494 :
1495 : /* OK, itemIndex says what to return */
4200 tgl 1496 GIC 10809722 : currItem = &so->currPos.items[so->currPos.itemIndex];
1490 andres 1497 10809722 : scan->xs_heaptid = currItem->heapTid;
4200 tgl 1498 10809722 : if (scan->xs_want_itup)
4200 tgl 1499 CBC 1788657 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
9345 bruce 1500 ECB :
6181 tgl 1501 CBC 10809722 : return true;
6181 tgl 1502 ECB : }
1503 :
1504 : /*
1505 : * _bt_readpage() -- Load data from current index page into so->currPos
1506 : *
1507 : * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1508 : * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1509 : * they are updated as appropriate. All other fields of so->currPos are
1510 : * initialized from scratch here.
1511 : *
1512 : * We scan the current page starting at offnum and moving in the indicated
1513 : * direction. All items matching the scan keys are loaded into currPos.items.
1514 : * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1515 : * that there can be no more matching tuples in the current scan direction.
1516 : *
1517 : * In the case of a parallel scan, caller must have called _bt_parallel_seize
1518 : * prior to calling this function; this function will invoke
1519 : * _bt_parallel_release before returning.
1520 : *
1521 : * Returns true if any matching items found on the page, false if none.
1522 : */
1523 : static bool
6181 tgl 1524 GIC 8712708 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1525 : {
1526 8712708 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
6181 tgl 1527 ECB : Page page;
1528 : BTPageOpaque opaque;
1529 : OffsetNumber minoff;
1530 : OffsetNumber maxoff;
1531 : int itemIndex;
1532 : bool continuescan;
1533 : int indnatts;
1534 :
1535 : /*
1536 : * We must have the buffer pinned and locked, but the usual macro can't be
1537 : * used here; this function is what makes it good for currPos.
1538 : */
6181 tgl 1539 GIC 8712708 : Assert(BufferIsValid(so->currPos.buf));
1540 :
2545 kgrittn 1541 8712708 : page = BufferGetPage(so->currPos.buf);
373 michael 1542 CBC 8712708 : opaque = BTPageGetOpaque(page);
1543 :
2244 rhaas 1544 ECB : /* allow next page be processed by parallel worker */
2244 rhaas 1545 CBC 8712708 : if (scan->parallel_scan)
1546 : {
2244 rhaas 1547 GIC 644 : if (ScanDirectionIsForward(dir))
2244 rhaas 1548 CBC 644 : _bt_parallel_release(scan, opaque->btpo_next);
1549 : else
2244 rhaas 1550 LBC 0 : _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
2244 rhaas 1551 ECB : }
1552 :
1478 pg 1553 GBC 8712708 : continuescan = true; /* default assumption */
1478 pg 1554 GIC 8712708 : indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
6181 tgl 1555 8712708 : minoff = P_FIRSTDATAKEY(opaque);
6181 tgl 1556 CBC 8712708 : maxoff = PageGetMaxOffsetNumber(page);
6181 tgl 1557 ECB :
2937 kgrittn 1558 : /*
1559 : * We note the buffer's block number so that we can release the pin later.
1560 : * This allows us to re-read the buffer if it is needed again for hinting.
1561 : */
2937 kgrittn 1562 GIC 8712708 : so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1563 :
1564 : /*
2937 kgrittn 1565 ECB : * We save the LSN of the page as we read it, so that we know whether it
1566 : * safe to apply LP_DEAD hints to the page later. This allows us to drop
1567 : * the pin for MVCC scans, which allows vacuum to avoid blocking.
1568 : */
1916 alvherre 1569 GIC 8712708 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1570 :
1571 : /*
6181 tgl 1572 ECB : * we must save the page's right-link while scanning it; this tells us
1573 : * where to step right to after we're done with these items. There is no
1574 : * corresponding need for the left-link, since splits always go right.
1575 : */
6181 tgl 1576 GIC 8712708 : so->currPos.nextPage = opaque->btpo_next;
1577 :
1578 : /* initialize tuple workspace to empty */
4200 tgl 1579 CBC 8712708 : so->currPos.nextTupleOffset = 0;
1580 :
1581 : /*
2937 kgrittn 1582 ECB : * Now that the current page has been made consistent, the macro should be
1583 : * good.
1584 : */
2937 kgrittn 1585 GIC 8712708 : Assert(BTScanPosIsPinned(so->currPos));
1586 :
6181 tgl 1587 8712708 : if (ScanDirectionIsForward(dir))
8762 bruce 1588 ECB : {
1589 : /* load items[] in ascending order */
6181 tgl 1590 CBC 8691103 : itemIndex = 0;
1591 :
6181 tgl 1592 GIC 8691103 : offnum = Max(offnum, minoff);
6181 tgl 1593 ECB :
6181 tgl 1594 GIC 30578310 : while (offnum <= maxoff)
6181 tgl 1595 ECB : {
1478 pg 1596 GIC 27999519 : ItemId iid = PageGetItemId(page, offnum);
1478 pg 1597 ECB : IndexTuple itup;
1598 :
1599 : /*
1600 : * If the scan specifies not to return killed tuples, then we
1601 : * treat a killed tuple as not passing the qual
1602 : */
1478 pg 1603 GIC 27999519 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1604 : {
1605 1157894 : offnum = OffsetNumberNext(offnum);
1478 pg 1606 CBC 1157894 : continue;
1607 : }
1478 pg 1608 ECB :
1478 pg 1609 CBC 26841625 : itup = (IndexTuple) PageGetItem(page, iid);
1610 :
1478 pg 1611 GIC 26841625 : if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
6181 tgl 1612 ECB : {
1613 : /* tuple passes all scan key conditions */
1138 pg 1614 CBC 20556895 : if (!BTreeTupleIsPosting(itup))
1615 : {
1616 : /* Remember it */
1617 20272487 : _bt_saveitem(so, itemIndex, offnum, itup);
1138 pg 1618 GIC 20272487 : itemIndex++;
1619 : }
1138 pg 1620 ECB : else
1621 : {
1622 : int tupleOffset;
1623 :
1624 : /*
1625 : * Set up state to return posting list, and remember first
1626 : * TID
1627 : */
1628 : tupleOffset =
1138 pg 1629 GIC 284408 : _bt_setuppostingitems(so, itemIndex, offnum,
1630 : BTreeTupleGetPostingN(itup, 0),
1631 : itup);
1138 pg 1632 CBC 284408 : itemIndex++;
1633 : /* Remember additional TIDs */
1138 pg 1634 GIC 1444153 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1138 pg 1635 ECB : {
1138 pg 1636 GIC 1159745 : _bt_savepostingitem(so, itemIndex, offnum,
1138 pg 1637 ECB : BTreeTupleGetPostingN(itup, i),
1638 : tupleOffset);
1138 pg 1639 CBC 1159745 : itemIndex++;
1640 : }
1641 : }
6181 tgl 1642 ECB : }
1643 : /* When !continuescan, there can't be any more matches, so stop */
6181 tgl 1644 GIC 26841625 : if (!continuescan)
1645 6112312 : break;
1646 :
6181 tgl 1647 CBC 20729313 : offnum = OffsetNumberNext(offnum);
6181 tgl 1648 ECB : }
1649 :
1478 pg 1650 : /*
1651 : * We don't need to visit page to the right when the high key
1652 : * indicates that no more matches will be found there.
1653 : *
1654 : * Checking the high key like this works out more often than you might
1655 : * think. Leaf page splits pick a split point between the two most
1656 : * dissimilar tuples (this is weighed against the need to evenly share
1657 : * free space). Leaf pages with high key attribute values that can
1658 : * only appear on non-pivot tuples on the right sibling page are
1659 : * common.
1660 : */
1478 pg 1661 GIC 8691103 : if (continuescan && !P_RIGHTMOST(opaque))
1662 : {
1663 62692 : ItemId iid = PageGetItemId(page, P_HIKEY);
1478 pg 1664 CBC 62692 : IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1665 : int truncatt;
1478 pg 1666 ECB :
1478 pg 1667 CBC 62692 : truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1478 pg 1668 GIC 62692 : _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1669 : }
1478 pg 1670 ECB :
1478 pg 1671 CBC 8691103 : if (!continuescan)
1478 pg 1672 GIC 6147219 : so->currPos.moreRight = false;
1673 :
1138 pg 1674 CBC 8691103 : Assert(itemIndex <= MaxTIDsPerBTreePage);
6181 tgl 1675 8691103 : so->currPos.firstItem = 0;
6181 tgl 1676 GIC 8691103 : so->currPos.lastItem = itemIndex - 1;
6181 tgl 1677 CBC 8691103 : so->currPos.itemIndex = 0;
8762 bruce 1678 ECB : }
9345 1679 : else
1680 : {
1681 : /* load items[] in descending order */
1138 pg 1682 GIC 21605 : itemIndex = MaxTIDsPerBTreePage;
1683 :
6181 tgl 1684 21605 : offnum = Min(offnum, maxoff);
6181 tgl 1685 ECB :
6181 tgl 1686 GIC 4336927 : while (offnum >= minoff)
6181 tgl 1687 ECB : {
1478 pg 1688 GIC 4315364 : ItemId iid = PageGetItemId(page, offnum);
1478 pg 1689 ECB : IndexTuple itup;
1690 : bool tuple_alive;
1691 : bool passes_quals;
1692 :
1693 : /*
1694 : * If the scan specifies not to return killed tuples, then we
1695 : * treat a killed tuple as not passing the qual. Most of the
1696 : * time, it's a win to not bother examining the tuple's index
1697 : * keys, but just skip to the next tuple (previous, actually,
1698 : * since we're scanning backwards). However, if this is the first
1699 : * tuple on the page, we do check the index keys, to prevent
1700 : * uselessly advancing to the page to the left. This is similar
1701 : * to the high key optimization used by forward scans.
1702 : */
1478 pg 1703 GIC 4315364 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1704 : {
1705 490854 : Assert(offnum >= P_FIRSTDATAKEY(opaque));
1478 pg 1706 CBC 490854 : if (offnum > P_FIRSTDATAKEY(opaque))
1707 : {
1708 490675 : offnum = OffsetNumberPrev(offnum);
1709 490675 : continue;
1710 : }
1478 pg 1711 ECB :
1478 pg 1712 CBC 179 : tuple_alive = false;
1713 : }
1714 : else
1715 3824510 : tuple_alive = true;
1716 :
1478 pg 1717 GIC 3824689 : itup = (IndexTuple) PageGetItem(page, iid);
1478 pg 1718 ECB :
1478 pg 1719 GIC 3824689 : passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1478 pg 1720 ECB : &continuescan);
1478 pg 1721 GIC 3824689 : if (passes_quals && tuple_alive)
6181 tgl 1722 ECB : {
1723 : /* tuple passes all scan key conditions */
1138 pg 1724 CBC 3824468 : if (!BTreeTupleIsPosting(itup))
1725 : {
1726 : /* Remember it */
1727 3798388 : itemIndex--;
1138 pg 1728 GIC 3798388 : _bt_saveitem(so, itemIndex, offnum, itup);
1729 : }
1138 pg 1730 ECB : else
1731 : {
1732 : int tupleOffset;
1733 :
1734 : /*
1735 : * Set up state to return posting list, and remember first
1736 : * TID.
1737 : *
1738 : * Note that we deliberately save/return items from
1739 : * posting lists in ascending heap TID order for backwards
1740 : * scans. This allows _bt_killitems() to make a
1741 : * consistent assumption about the order of items
1742 : * associated with the same posting list tuple.
1743 : */
1138 pg 1744 GIC 26080 : itemIndex--;
1745 : tupleOffset =
1746 26080 : _bt_setuppostingitems(so, itemIndex, offnum,
1138 pg 1747 ECB : BTreeTupleGetPostingN(itup, 0),
1748 : itup);
1749 : /* Remember additional TIDs */
1138 pg 1750 GIC 76589 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1751 : {
1752 50509 : itemIndex--;
1138 pg 1753 CBC 50509 : _bt_savepostingitem(so, itemIndex, offnum,
1754 : BTreeTupleGetPostingN(itup, i),
1138 pg 1755 ECB : tupleOffset);
1756 : }
1757 : }
1758 : }
6181 tgl 1759 GIC 3824689 : if (!continuescan)
1760 : {
1761 : /* there can't be any more matches, so stop */
6181 tgl 1762 CBC 42 : so->currPos.moreLeft = false;
6181 tgl 1763 GIC 42 : break;
1764 : }
6181 tgl 1765 ECB :
6181 tgl 1766 CBC 3824647 : offnum = OffsetNumberPrev(offnum);
1767 : }
1768 :
1769 21605 : Assert(itemIndex >= 0);
6181 tgl 1770 GIC 21605 : so->currPos.firstItem = itemIndex;
1138 pg 1771 21605 : so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1138 pg 1772 CBC 21605 : so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
9345 bruce 1773 ECB : }
1774 :
6181 tgl 1775 CBC 8712708 : return (so->currPos.firstItem <= so->currPos.lastItem);
1776 : }
1777 :
4200 tgl 1778 ECB : /* Save an index item into so->currPos.items[itemIndex] */
1779 : static void
4200 tgl 1780 GIC 24070875 : _bt_saveitem(BTScanOpaque so, int itemIndex,
1781 : OffsetNumber offnum, IndexTuple itup)
1782 : {
4200 tgl 1783 CBC 24070875 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1784 :
1138 pg 1785 GIC 24070875 : Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1138 pg 1786 ECB :
4200 tgl 1787 GIC 24070875 : currItem->heapTid = itup->t_tid;
4200 tgl 1788 CBC 24070875 : currItem->indexOffset = offnum;
4200 tgl 1789 GIC 24070875 : if (so->currTuples)
4200 tgl 1790 ECB : {
4200 tgl 1791 CBC 9999342 : Size itupsz = IndexTupleSize(itup);
4200 tgl 1792 ECB :
4200 tgl 1793 GIC 9999342 : currItem->tupleOffset = so->currPos.nextTupleOffset;
4200 tgl 1794 CBC 9999342 : memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
4200 tgl 1795 GIC 9999342 : so->currPos.nextTupleOffset += MAXALIGN(itupsz);
4200 tgl 1796 ECB : }
4200 tgl 1797 CBC 24070875 : }
4200 tgl 1798 ECB :
1799 : /*
1138 pg 1800 : * Setup state to save TIDs/items from a single posting list tuple.
1801 : *
1802 : * Saves an index item into so->currPos.items[itemIndex] for TID that is
1803 : * returned to scan first. Second or subsequent TIDs for posting list should
1804 : * be saved by calling _bt_savepostingitem().
1805 : *
1806 : * Returns an offset into tuple storage space that main tuple is stored at if
1807 : * needed.
1808 : */
1809 : static int
1138 pg 1810 GIC 310488 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1811 : ItemPointer heapTid, IndexTuple itup)
1812 : {
1138 pg 1813 CBC 310488 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1814 :
1138 pg 1815 GIC 310488 : Assert(BTreeTupleIsPosting(itup));
1138 pg 1816 ECB :
1138 pg 1817 GIC 310488 : currItem->heapTid = *heapTid;
1138 pg 1818 CBC 310488 : currItem->indexOffset = offnum;
1138 pg 1819 GIC 310488 : if (so->currTuples)
1138 pg 1820 ECB : {
1821 : /* Save base IndexTuple (truncate posting list) */
1822 : IndexTuple base;
1138 pg 1823 GIC 133414 : Size itupsz = BTreeTupleGetPostingOffset(itup);
1824 :
1825 133414 : itupsz = MAXALIGN(itupsz);
1138 pg 1826 CBC 133414 : currItem->tupleOffset = so->currPos.nextTupleOffset;
1138 pg 1827 GIC 133414 : base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1138 pg 1828 CBC 133414 : memcpy(base, itup, itupsz);
1138 pg 1829 ECB : /* Defensively reduce work area index tuple header size */
1138 pg 1830 CBC 133414 : base->t_info &= ~INDEX_SIZE_MASK;
1831 133414 : base->t_info |= itupsz;
1138 pg 1832 GIC 133414 : so->currPos.nextTupleOffset += itupsz;
1138 pg 1833 ECB :
1138 pg 1834 CBC 133414 : return currItem->tupleOffset;
1138 pg 1835 ECB : }
1836 :
1138 pg 1837 CBC 177074 : return 0;
1838 : }
1839 :
1138 pg 1840 ECB : /*
1841 : * Save an index item into so->currPos.items[itemIndex] for current posting
1842 : * tuple.
1843 : *
1844 : * Assumes that _bt_setuppostingitems() has already been called for current
1845 : * posting list tuple. Caller passes its return value as tupleOffset.
1846 : */
1847 : static inline void
1138 pg 1848 GIC 1210254 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1849 : ItemPointer heapTid, int tupleOffset)
1850 : {
1138 pg 1851 CBC 1210254 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1852 :
1138 pg 1853 GIC 1210254 : currItem->heapTid = *heapTid;
1138 pg 1854 CBC 1210254 : currItem->indexOffset = offnum;
1855 :
1138 pg 1856 ECB : /*
1857 : * Have index-only scans return the same base IndexTuple for every TID
1858 : * that originates from the same posting list
1859 : */
1138 pg 1860 GIC 1210254 : if (so->currTuples)
1861 465558 : currItem->tupleOffset = tupleOffset;
1862 1210254 : }
1138 pg 1863 ECB :
9770 scrappy 1864 : /*
6181 tgl 1865 : * _bt_steppage() -- Step to next page containing valid data for scan
1866 : *
1867 : * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1868 : * if pinned, we'll drop the pin before moving to next page. The buffer is
1869 : * not locked on entry.
1870 : *
1871 : * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1872 : * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1873 : * to InvalidBuffer. We return true to indicate success.
1874 : */
1875 : static bool
6181 tgl 1876 GIC 4851051 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1877 : {
8297 1878 4851051 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2244 rhaas 1879 CBC 4851051 : BlockNumber blkno = InvalidBlockNumber;
1880 : bool status;
8720 bruce 1881 ECB :
2937 kgrittn 1882 CBC 4851051 : Assert(BTScanPosIsValid(so->currPos));
1883 :
1884 : /* Before leaving current page, deal with any killed items */
6181 tgl 1885 4851051 : if (so->numKilled > 0)
2937 kgrittn 1886 GIC 39535 : _bt_killitems(scan);
1887 :
6072 tgl 1888 ECB : /*
6031 bruce 1889 : * Before we modify currPos, make a copy of the page data if there was a
1890 : * mark position that needs it.
1891 : */
6072 tgl 1892 GIC 4851051 : if (so->markItemIndex >= 0)
1893 : {
1894 : /* bump pin on current buffer for assignment to mark buffer */
2937 kgrittn 1895 CBC 183 : if (BTScanPosIsPinned(so->currPos))
2937 kgrittn 1896 GIC 175 : IncrBufferRefCount(so->currPos.buf);
6072 tgl 1897 183 : memcpy(&so->markPos, &so->currPos,
6072 tgl 1898 ECB : offsetof(BTScanPosData, items[1]) +
6072 tgl 1899 CBC 183 : so->currPos.lastItem * sizeof(BTScanPosItem));
4200 1900 183 : if (so->markTuples)
4200 tgl 1901 GIC 174 : memcpy(so->markTuples, so->currTuples,
4200 tgl 1902 CBC 174 : so->currPos.nextTupleOffset);
6072 1903 183 : so->markPos.itemIndex = so->markItemIndex;
1904 183 : so->markItemIndex = -1;
6072 tgl 1905 ECB : }
1906 :
9345 bruce 1907 CBC 4851051 : if (ScanDirectionIsForward(dir))
1908 : {
1909 : /* Walk right to the next page with data */
2244 rhaas 1910 4851005 : if (scan->parallel_scan != NULL)
1911 : {
1912 : /*
2244 rhaas 1913 ECB : * Seize the scan to get the next block number; if the scan has
1914 : * ended already, bail out.
1915 : */
2244 rhaas 1916 GIC 644 : status = _bt_parallel_seize(scan, &blkno);
1917 644 : if (!status)
1918 : {
2244 rhaas 1919 ECB : /* release the previous buffer, if pinned */
2244 rhaas 1920 CBC 6 : BTScanPosUnpinIfPinned(so->currPos);
2244 rhaas 1921 GIC 6 : BTScanPosInvalidate(so->currPos);
1922 6 : return false;
2244 rhaas 1923 ECB : }
1924 : }
1925 : else
1926 : {
1927 : /* Not parallel, so use the previously-saved nextPage link. */
2244 rhaas 1928 GIC 4850361 : blkno = so->currPos.nextPage;
1929 : }
1930 :
6181 tgl 1931 ECB : /* Remember we left a page with data */
6181 tgl 1932 GIC 4850999 : so->currPos.moreLeft = true;
1933 :
1934 : /* release the previous buffer, if pinned */
2937 kgrittn 1935 CBC 4850999 : BTScanPosUnpinIfPinned(so->currPos);
1936 : }
1937 : else
2244 rhaas 1938 ECB : {
1939 : /* Remember we left a page with data */
2244 rhaas 1940 GIC 46 : so->currPos.moreRight = true;
1941 :
1942 46 : if (scan->parallel_scan != NULL)
2244 rhaas 1943 ECB : {
1944 : /*
1945 : * Seize the scan to get the current block number; if the scan has
1946 : * ended already, bail out.
1947 : */
2244 rhaas 1948 UIC 0 : status = _bt_parallel_seize(scan, &blkno);
1949 0 : BTScanPosUnpinIfPinned(so->currPos);
1950 0 : if (!status)
2244 rhaas 1951 EUB : {
2244 rhaas 1952 UBC 0 : BTScanPosInvalidate(so->currPos);
1953 0 : return false;
1954 : }
2244 rhaas 1955 EUB : }
1956 : else
1957 : {
1958 : /* Not parallel, so just use our own notion of the current page */
2244 rhaas 1959 GIC 46 : blkno = so->currPos.currPage;
1960 : }
1961 : }
2244 rhaas 1962 ECB :
2244 rhaas 1963 GIC 4851045 : if (!_bt_readnextpage(scan, blkno, dir))
1964 4836686 : return false;
1965 :
2244 rhaas 1966 ECB : /* Drop the lock, and maybe the pin, on the current page */
2244 rhaas 1967 CBC 14359 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1968 :
2244 rhaas 1969 GIC 14359 : return true;
2244 rhaas 1970 ECB : }
1971 :
1972 : /*
1973 : * _bt_readnextpage() -- Read next page containing valid data for scan
1974 : *
1975 : * On success exit, so->currPos is updated to contain data from the next
1976 : * interesting page. Caller is responsible to release lock and pin on
1977 : * buffer on success. We return true to indicate success.
1978 : *
1979 : * If there are no more matching records in the given direction, we drop all
1980 : * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1981 : */
1982 : static bool
2244 rhaas 1983 GIC 4851049 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1984 : {
1985 4851049 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2244 rhaas 1986 ECB : Relation rel;
1987 : Page page;
1988 : BTPageOpaque opaque;
1989 : bool status;
1990 :
2244 rhaas 1991 GIC 4851049 : rel = scan->indexRelation;
1992 :
1993 4851049 : if (ScanDirectionIsForward(dir))
2244 rhaas 1994 ECB : {
1995 : for (;;)
9345 bruce 1996 : {
1997 : /*
1998 : * if we're at end of scan, give up and mark parallel scan as
1999 : * done, so that all the workers can finish their scan
2000 : */
6181 tgl 2001 GIC 4851755 : if (blkno == P_NONE || !so->currPos.moreRight)
2002 : {
2244 rhaas 2003 4836652 : _bt_parallel_done(scan);
2937 kgrittn 2004 CBC 4836652 : BTScanPosInvalidate(so->currPos);
6181 tgl 2005 GIC 4836652 : return false;
2937 kgrittn 2006 ECB : }
5087 tgl 2007 : /* check for interrupts while we're not holding any buffer lock */
5087 tgl 2008 CBC 15103 : CHECK_FOR_INTERRUPTS();
2009 : /* step right one page */
8 andres 2010 GNC 15103 : so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno, BT_READ);
2545 kgrittn 2011 CBC 15103 : page = BufferGetPage(so->currPos.buf);
2545 kgrittn 2012 GIC 15103 : TestForOldSnapshot(scan->xs_snapshot, rel, page);
373 michael 2013 CBC 15103 : opaque = BTPageGetOpaque(page);
2259 rhaas 2014 ECB : /* check for deleted page */
6181 tgl 2015 CBC 15103 : if (!P_IGNORE(opaque))
9345 bruce 2016 ECB : {
4316 heikki.linnakangas 2017 GIC 15103 : PredicateLockPage(rel, blkno, scan->xs_snapshot);
6181 tgl 2018 ECB : /* see if there are any matches on this page */
2019 : /* note that this will clear moreRight if we can stop */
6181 tgl 2020 CBC 15103 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
6181 tgl 2021 GIC 14351 : break;
2022 : }
1943 rhaas 2023 LBC 0 : else if (scan->parallel_scan != NULL)
1943 rhaas 2024 ECB : {
2025 : /* allow next page be processed by parallel worker */
1943 rhaas 2026 UBC 0 : _bt_parallel_release(scan, opaque->btpo_next);
2027 : }
2028 :
6181 tgl 2029 EUB : /* nope, keep going */
2244 rhaas 2030 GIC 752 : if (scan->parallel_scan != NULL)
2031 : {
1717 akapila 2032 UIC 0 : _bt_relbuf(rel, so->currPos.buf);
2244 rhaas 2033 LBC 0 : status = _bt_parallel_seize(scan, &blkno);
2244 rhaas 2034 UIC 0 : if (!status)
2244 rhaas 2035 EUB : {
2244 rhaas 2036 UBC 0 : BTScanPosInvalidate(so->currPos);
2037 0 : return false;
2038 : }
2244 rhaas 2039 EUB : }
2040 : else
2041 : {
2244 rhaas 2042 GIC 752 : blkno = opaque->btpo_next;
1717 akapila 2043 752 : _bt_relbuf(rel, so->currPos.buf);
2044 : }
9770 scrappy 2045 ECB : }
2046 : }
2047 : else
2048 : {
2049 : /*
2050 : * Should only happen in parallel cases, when some other backend
2051 : * advanced the scan.
2052 : */
2244 rhaas 2053 GIC 46 : if (so->currPos.currPage != blkno)
2054 : {
2244 rhaas 2055 UIC 0 : BTScanPosUnpinIfPinned(so->currPos);
2244 rhaas 2056 LBC 0 : so->currPos.currPage = blkno;
2057 : }
6181 tgl 2058 EUB :
2059 : /*
2060 : * Walk left to the next page with data. This is much more complex
2061 : * than the walk-right case because of the possibility that the page
2062 : * to our left splits while we are in flight to it, plus the
2063 : * possibility that the page we were on gets deleted after we leave
2064 : * it. See nbtree/README for details.
2065 : *
2066 : * It might be possible to rearrange this code to have less overhead
2067 : * in pinning and locking, but that would require capturing the left
2068 : * pointer when the page is initially read, and using it here, along
2069 : * with big changes to _bt_walk_left() and the code below. It is not
2070 : * clear whether this would be a win, since if the page immediately to
2071 : * the left splits after we read this page and before we step left, we
2072 : * would need to visit more pages than with the current code.
2073 : *
2074 : * Note that if we change the code so that we drop the pin for a scan
2075 : * which uses a non-MVCC snapshot, we will need to modify the code for
2076 : * walking left, to allow for the possibility that a referenced page
2077 : * has been deleted. As long as the buffer is pinned or the snapshot
2078 : * is MVCC the page cannot move past the half-dead state to fully
2079 : * deleted.
2080 : */
2937 kgrittn 2081 GIC 46 : if (BTScanPosIsPinned(so->currPos))
992 pg 2082 24 : _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2083 : else
8 andres 2084 GNC 22 : so->currPos.buf = _bt_getbuf(rel, scan->heapRelation,
2085 : so->currPos.currPage, BT_READ);
2937 kgrittn 2086 ECB :
2087 : for (;;)
9345 bruce 2088 : {
2089 : /* Done if we know there are no matching keys to the left */
6181 tgl 2090 GIC 48 : if (!so->currPos.moreLeft)
2091 : {
2092 21 : _bt_relbuf(rel, so->currPos.buf);
2244 rhaas 2093 21 : _bt_parallel_done(scan);
2937 kgrittn 2094 CBC 21 : BTScanPosInvalidate(so->currPos);
6181 tgl 2095 GIC 21 : return false;
6181 tgl 2096 ECB : }
7351 2097 :
6181 2098 : /* Step to next physical page */
8 andres 2099 GNC 27 : so->currPos.buf = _bt_walk_left(rel, scan->heapRelation,
2100 : so->currPos.buf, scan->xs_snapshot);
2101 :
2102 : /* if we're physically at end of index, return failure */
6181 tgl 2103 CBC 27 : if (so->currPos.buf == InvalidBuffer)
2104 : {
2244 rhaas 2105 GIC 13 : _bt_parallel_done(scan);
2937 kgrittn 2106 13 : BTScanPosInvalidate(so->currPos);
6181 tgl 2107 CBC 13 : return false;
2108 : }
6181 tgl 2109 ECB :
2110 : /*
6031 bruce 2111 : * Okay, we managed to move left to a non-deleted page. Done if
2112 : * it's not half-dead and contains matching tuples. Else loop back
2113 : * and do it all again.
2114 : */
2545 kgrittn 2115 GIC 14 : page = BufferGetPage(so->currPos.buf);
2116 14 : TestForOldSnapshot(scan->xs_snapshot, rel, page);
373 michael 2117 14 : opaque = BTPageGetOpaque(page);
6181 tgl 2118 14 : if (!P_IGNORE(opaque))
6181 tgl 2119 ECB : {
4316 heikki.linnakangas 2120 CBC 14 : PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
6181 tgl 2121 ECB : /* see if there are any matches on this page */
2122 : /* note that this will clear moreLeft if we can stop */
6181 tgl 2123 GIC 14 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
6181 tgl 2124 CBC 12 : break;
2125 : }
1943 rhaas 2126 UIC 0 : else if (scan->parallel_scan != NULL)
1943 rhaas 2127 ECB : {
2128 : /* allow next page be processed by parallel worker */
1943 rhaas 2129 UIC 0 : _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1943 rhaas 2130 EUB : }
2131 :
2132 : /*
2244 2133 : * For parallel scans, get the last page scanned as it is quite
2134 : * possible that by the time we try to seize the scan, some other
2135 : * worker has already advanced the scan to a different page. We
2136 : * must continue based on the latest page scanned by any worker.
2137 : */
2244 rhaas 2138 GIC 2 : if (scan->parallel_scan != NULL)
2139 : {
2244 rhaas 2140 UIC 0 : _bt_relbuf(rel, so->currPos.buf);
2141 0 : status = _bt_parallel_seize(scan, &blkno);
2244 rhaas 2142 LBC 0 : if (!status)
2143 : {
2244 rhaas 2144 UBC 0 : BTScanPosInvalidate(so->currPos);
2145 0 : return false;
2244 rhaas 2146 EUB : }
8 andres 2147 UNC 0 : so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno,
2148 : BT_READ);
2244 rhaas 2149 EUB : }
9770 scrappy 2150 : }
2151 : }
8297 tgl 2152 :
2244 rhaas 2153 GIC 14363 : return true;
2154 : }
2155 :
2156 : /*
2157 : * _bt_parallel_readpage() -- Read current page containing valid data for scan
2244 rhaas 2158 ECB : *
2159 : * On success, release lock and maybe pin on buffer. We return true to
2160 : * indicate success.
2161 : */
2162 : static bool
2244 rhaas 2163 GIC 4 : _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2164 : {
2165 4 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2166 :
2167 4 : _bt_initialize_more_data(so, dir);
2244 rhaas 2168 ECB :
2244 rhaas 2169 GIC 4 : if (!_bt_readnextpage(scan, blkno, dir))
2244 rhaas 2170 LBC 0 : return false;
2171 :
2937 kgrittn 2172 ECB : /* Drop the lock, and maybe the pin, on the current page */
2937 kgrittn 2173 GIC 4 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2937 kgrittn 2174 ECB :
8986 bruce 2175 GBC 4 : return true;
2176 : }
2177 :
7351 tgl 2178 ECB : /*
2179 : * _bt_walk_left() -- step left one page, if possible
2180 : *
2181 : * The given buffer must be pinned and read-locked. This will be dropped
2182 : * before stepping left. On return, we have pin and read lock on the
2183 : * returned page, instead.
2184 : *
2185 : * Returns InvalidBuffer if there is no page to the left (no lock is held
2186 : * in that case).
2187 : *
2188 : * When working on a non-leaf level, it is possible for the returned page
2189 : * to be half-dead; the caller should check that condition and step left
2190 : * again if it's important.
2191 : */
2192 : static Buffer
8 andres 2193 GNC 27 : _bt_walk_left(Relation rel, Relation heaprel, Buffer buf, Snapshot snapshot)
2194 : {
2195 : Page page;
2196 : BTPageOpaque opaque;
2197 :
2545 kgrittn 2198 CBC 27 : page = BufferGetPage(buf);
373 michael 2199 GIC 27 : opaque = BTPageGetOpaque(page);
2200 :
2201 : for (;;)
7351 tgl 2202 UIC 0 : {
7351 tgl 2203 ECB : BlockNumber obknum;
2204 : BlockNumber lblkno;
2205 : BlockNumber blkno;
2206 : int tries;
7351 tgl 2207 EUB :
2208 : /* if we're at end of tree, release buf and return failure */
7351 tgl 2209 GIC 27 : if (P_LEFTMOST(opaque))
2210 : {
2211 13 : _bt_relbuf(rel, buf);
2212 13 : break;
2213 : }
7351 tgl 2214 ECB : /* remember original page we are stepping left from */
7351 tgl 2215 GIC 14 : obknum = BufferGetBlockNumber(buf);
7351 tgl 2216 ECB : /* step left */
7351 tgl 2217 CBC 14 : blkno = lblkno = opaque->btpo_prev;
5087 tgl 2218 GIC 14 : _bt_relbuf(rel, buf);
2219 : /* check for interrupts while we're not holding any buffer lock */
5087 tgl 2220 CBC 14 : CHECK_FOR_INTERRUPTS();
8 andres 2221 GNC 14 : buf = _bt_getbuf(rel, heaprel, blkno, BT_READ);
2545 kgrittn 2222 CBC 14 : page = BufferGetPage(buf);
2223 14 : TestForOldSnapshot(snapshot, rel, page);
373 michael 2224 GIC 14 : opaque = BTPageGetOpaque(page);
7188 bruce 2225 ECB :
7351 tgl 2226 : /*
7188 bruce 2227 : * If this isn't the page we want, walk right till we find what we
6385 2228 : * want --- but go no more than four hops (an arbitrary limit). If we
2229 : * don't find the correct page by then, the most likely bet is that
2230 : * the original page got deleted and isn't in the sibling chain at all
2231 : * anymore, not that its left sibling got split more than four times.
2232 : *
2233 : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2234 : * because half-dead pages are still in the sibling chain. Caller
2235 : * must reject half-dead pages if wanted.
2236 : */
7351 tgl 2237 GIC 14 : tries = 0;
2238 : for (;;)
2239 : {
2240 14 : if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2241 : {
7351 tgl 2242 ECB : /* Found desired page, return it */
7351 tgl 2243 GIC 14 : return buf;
2244 : }
7351 tgl 2245 LBC 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
2246 : break;
7351 tgl 2247 UIC 0 : blkno = opaque->btpo_next;
6927 tgl 2248 LBC 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2545 kgrittn 2249 UIC 0 : page = BufferGetPage(buf);
2545 kgrittn 2250 UBC 0 : TestForOldSnapshot(snapshot, rel, page);
373 michael 2251 UIC 0 : opaque = BTPageGetOpaque(page);
7351 tgl 2252 EUB : }
2253 :
2254 : /* Return to the original page to see what's up */
6927 tgl 2255 UBC 0 : buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2545 kgrittn 2256 0 : page = BufferGetPage(buf);
2545 kgrittn 2257 UIC 0 : TestForOldSnapshot(snapshot, rel, page);
373 michael 2258 0 : opaque = BTPageGetOpaque(page);
7351 tgl 2259 0 : if (P_ISDELETED(opaque))
7351 tgl 2260 EUB : {
2261 : /*
3260 bruce 2262 : * It was deleted. Move right to first nondeleted page (there
6385 2263 : * must be one); that is the page that has acquired the deleted
2264 : * one's keyspace, so stepping left from it will take us where we
2265 : * want to be.
2266 : */
2267 : for (;;)
2268 : {
7351 tgl 2269 UIC 0 : if (P_RIGHTMOST(opaque))
5578 2270 0 : elog(ERROR, "fell off the end of index \"%s\"",
2271 : RelationGetRelationName(rel));
7351 2272 0 : blkno = opaque->btpo_next;
6927 2273 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2545 kgrittn 2274 UBC 0 : page = BufferGetPage(buf);
2275 0 : TestForOldSnapshot(snapshot, rel, page);
373 michael 2276 UIC 0 : opaque = BTPageGetOpaque(page);
7351 tgl 2277 UBC 0 : if (!P_ISDELETED(opaque))
2278 0 : break;
7351 tgl 2279 EUB : }
7188 bruce 2280 :
7351 tgl 2281 : /*
6385 bruce 2282 : * Now return to top of loop, resetting obknum to point to this
2283 : * nondeleted page, and try again.
2284 : */
2285 : }
2286 : else
2287 : {
2288 : /*
2289 : * It wasn't deleted; the explanation had better be that the page
2290 : * to the left got split or deleted. Without this check, we'd go
2291 : * into an infinite loop if there's anything wrong.
2292 : */
7351 tgl 2293 UIC 0 : if (opaque->btpo_prev == lblkno)
5578 2294 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2295 : obknum, RelationGetRelationName(rel));
2296 : /* Okay to try again with new lblkno value */
2297 : }
7351 tgl 2298 EUB : }
2299 :
7351 tgl 2300 GIC 13 : return InvalidBuffer;
2301 : }
2302 :
2303 : /*
2304 : * _bt_get_endpoint() -- Find the first or last page on a given tree level
7352 tgl 2305 ECB : *
2306 : * If the index is empty, we will return InvalidBuffer; any other failure
2307 : * condition causes ereport(). We will not return a dead page.
2308 : *
2309 : * The returned buffer is pinned and read-locked.
2310 : */
2311 : Buffer
8 andres 2312 GNC 31969 : _bt_get_endpoint(Relation rel, Relation heaprel, uint32 level, bool rightmost,
2313 : Snapshot snapshot)
2314 : {
2315 : Buffer buf;
2316 : Page page;
7352 tgl 2317 ECB : BTPageOpaque opaque;
2318 : OffsetNumber offnum;
2319 : BlockNumber blkno;
2320 : IndexTuple itup;
2321 :
2322 : /*
2323 : * If we are looking for a leaf page, okay to descend from fast root;
2324 : * otherwise better descend from true root. (There is no point in being
2325 : * smarter about intermediate levels.)
2326 : */
7352 tgl 2327 GIC 31969 : if (level == 0)
8 andres 2328 GNC 31957 : buf = _bt_getroot(rel, heaprel, BT_READ);
2329 : else
2330 12 : buf = _bt_gettrueroot(rel, heaprel);
2331 :
7352 tgl 2332 CBC 31969 : if (!BufferIsValid(buf))
2333 3197 : return InvalidBuffer;
2334 :
2545 kgrittn 2335 28772 : page = BufferGetPage(buf);
2545 kgrittn 2336 GIC 28772 : TestForOldSnapshot(snapshot, rel, page);
373 michael 2337 CBC 28772 : opaque = BTPageGetOpaque(page);
7352 tgl 2338 ECB :
2339 : for (;;)
2340 : {
2341 : /*
2342 : * If we landed on a deleted page, step right to find a live page
2343 : * (there must be one). Also, if we want the rightmost page, step
2344 : * right if needed to get to it (this could happen if the page split
2345 : * since we obtained a pointer to it).
2346 : */
7351 tgl 2347 GIC 42145 : while (P_IGNORE(opaque) ||
7352 2348 27 : (rightmost && !P_RIGHTMOST(opaque)))
2349 : {
7352 tgl 2350 UIC 0 : blkno = opaque->btpo_next;
2351 0 : if (blkno == P_NONE)
5578 tgl 2352 LBC 0 : elog(ERROR, "fell off the end of index \"%s\"",
7351 tgl 2353 ECB : RelationGetRelationName(rel));
6927 tgl 2354 UIC 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2545 kgrittn 2355 UBC 0 : page = BufferGetPage(buf);
2356 0 : TestForOldSnapshot(snapshot, rel, page);
373 michael 2357 0 : opaque = BTPageGetOpaque(page);
2358 : }
7352 tgl 2359 EUB :
2360 : /* Done? */
774 pg 2361 GBC 42145 : if (opaque->btpo_level == level)
7352 tgl 2362 28772 : break;
774 pg 2363 GIC 13373 : if (opaque->btpo_level < level)
1347 peter 2364 UIC 0 : ereport(ERROR,
2365 : (errcode(ERRCODE_INDEX_CORRUPTED),
1347 peter 2366 ECB : errmsg_internal("btree level %u not found in index \"%s\"",
2367 : level, RelationGetRelationName(rel))));
7352 tgl 2368 :
7351 tgl 2369 EUB : /* Descend to leftmost or rightmost child page */
7352 tgl 2370 GIC 13373 : if (rightmost)
2371 3 : offnum = PageGetMaxOffsetNumber(page);
2372 : else
2373 13370 : offnum = P_FIRSTDATAKEY(opaque);
2374 :
6283 tgl 2375 CBC 13373 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1210 pg 2376 13373 : blkno = BTreeTupleGetDownLink(itup);
2377 :
6927 tgl 2378 13373 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2545 kgrittn 2379 GIC 13373 : page = BufferGetPage(buf);
373 michael 2380 CBC 13373 : opaque = BTPageGetOpaque(page);
7352 tgl 2381 ECB : }
2382 :
7352 tgl 2383 CBC 28772 : return buf;
7352 tgl 2384 ECB : }
2385 :
2386 : /*
2387 : * _bt_endpoint() -- Find the first or last page in the index, and scan
7088 2388 : * from there to the first key satisfying all the quals.
2389 : *
2390 : * This is used by _bt_first() to set up a scan when we've determined
2391 : * that the scan must start at the beginning or end of the index (for
2392 : * a forward or backward scan respectively). Exit conditions are the
2393 : * same as for _bt_first().
2394 : */
2395 : static bool
9770 scrappy 2396 GIC 31957 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2397 : {
6181 tgl 2398 31957 : Relation rel = scan->indexRelation;
2399 31957 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2400 : Buffer buf;
9344 bruce 2401 ECB : Page page;
2402 : BTPageOpaque opaque;
8297 tgl 2403 : OffsetNumber start;
4200 2404 : BTScanPosItem *currItem;
2405 :
2406 : /*
2407 : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2408 : * version of _bt_search(). We don't maintain a stack since we know we
2409 : * won't need it.
2410 : */
8 andres 2411 GNC 31957 : buf = _bt_get_endpoint(rel, scan->heapRelation, 0,
2412 : ScanDirectionIsBackward(dir), scan->xs_snapshot);
2413 :
8053 bruce 2414 GIC 31957 : if (!BufferIsValid(buf))
2415 : {
2416 : /*
4316 heikki.linnakangas 2417 ECB : * Empty index. Lock the whole relation, as nothing finer to lock
2418 : * exists.
2419 : */
4316 heikki.linnakangas 2420 CBC 3197 : PredicateLockRelation(rel, scan->xs_snapshot);
2937 kgrittn 2421 GIC 3197 : BTScanPosInvalidate(so->currPos);
7629 tgl 2422 3197 : return false;
2423 : }
2424 :
4316 heikki.linnakangas 2425 28760 : PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2545 kgrittn 2426 CBC 28760 : page = BufferGetPage(buf);
373 michael 2427 28760 : opaque = BTPageGetOpaque(page);
7352 tgl 2428 28760 : Assert(P_ISLEAF(opaque));
2429 :
9345 bruce 2430 GIC 28760 : if (ScanDirectionIsForward(dir))
9345 bruce 2431 ECB : {
7352 tgl 2432 : /* There could be dead pages to the left, so not this: */
2433 : /* Assert(P_LEFTMOST(opaque)); */
9345 bruce 2434 :
8297 tgl 2435 GIC 28736 : start = P_FIRSTDATAKEY(opaque);
9345 bruce 2436 ECB : }
9345 bruce 2437 GIC 24 : else if (ScanDirectionIsBackward(dir))
2438 : {
8297 tgl 2439 24 : Assert(P_RIGHTMOST(opaque));
2440 :
8297 tgl 2441 CBC 24 : start = PageGetMaxOffsetNumber(page);
2442 : }
9345 bruce 2443 ECB : else
2444 : {
7202 tgl 2445 LBC 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2446 : start = 0; /* keep compiler quiet */
8297 tgl 2447 ECB : }
2448 :
2449 : /* remember which buffer we have pinned */
6181 tgl 2450 GIC 28760 : so->currPos.buf = buf;
8297 tgl 2451 EUB :
2244 rhaas 2452 GIC 28760 : _bt_initialize_more_data(so, dir);
2453 :
2454 : /*
2455 : * Now load data from the first page of the scan.
7088 tgl 2456 ECB : */
6181 tgl 2457 GIC 28760 : if (!_bt_readpage(scan, dir, start))
9620 vadim4o 2458 ECB : {
2459 : /*
2460 : * There's no actually-matching data on this page. Try to advance to
2461 : * the next page. Return false if there's no matching data at all.
2462 : */
992 pg 2463 CBC 820 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
6181 tgl 2464 GIC 820 : if (!_bt_steppage(scan, dir))
2465 781 : return false;
2466 : }
2467 : else
2468 : {
2937 kgrittn 2469 ECB : /* Drop the lock, and maybe the pin, on the current page */
2937 kgrittn 2470 CBC 27940 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2937 kgrittn 2471 ECB : }
2472 :
2473 : /* OK, itemIndex says what to return */
4200 tgl 2474 GIC 27979 : currItem = &so->currPos.items[so->currPos.itemIndex];
1490 andres 2475 27979 : scan->xs_heaptid = currItem->heapTid;
4200 tgl 2476 CBC 27979 : if (scan->xs_want_itup)
4200 tgl 2477 GIC 25940 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2478 :
6181 2479 27979 : return true;
9770 scrappy 2480 ECB : }
2244 rhaas 2481 :
2482 : /*
2483 : * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2484 : * for scan direction
2485 : */
2486 : static inline void
2244 rhaas 2487 GIC 8697595 : _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2488 : {
2489 : /* initialize moreLeft/moreRight appropriately for scan direction */
2490 8697595 : if (ScanDirectionIsForward(dir))
2491 : {
2492 8676004 : so->currPos.moreLeft = false;
2244 rhaas 2493 CBC 8676004 : so->currPos.moreRight = true;
2494 : }
2495 : else
2244 rhaas 2496 ECB : {
2244 rhaas 2497 GIC 21591 : so->currPos.moreLeft = true;
2244 rhaas 2498 CBC 21591 : so->currPos.moreRight = false;
2244 rhaas 2499 ECB : }
2244 rhaas 2500 GIC 8697595 : so->numKilled = 0; /* just paranoia */
2501 8697595 : so->markItemIndex = -1; /* ditto */
2502 8697595 : }
|