TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * nbtsearch.c
4 : * Search code for postgres btrees.
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/nbtree/nbtsearch.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "access/nbtree.h"
19 : #include "access/relscan.h"
20 : #include "miscadmin.h"
21 : #include "pgstat.h"
22 : #include "storage/predicate.h"
23 : #include "utils/lsyscache.h"
24 : #include "utils/rel.h"
25 :
26 :
27 : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
28 : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
29 : static int _bt_binsrch_posting(BTScanInsert key, Page page,
30 : OffsetNumber offnum);
31 : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
32 : OffsetNumber offnum);
33 : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
34 : OffsetNumber offnum, IndexTuple itup);
35 : static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
36 : OffsetNumber offnum, ItemPointer heapTid,
37 : IndexTuple itup);
38 : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
39 : OffsetNumber offnum,
40 : ItemPointer heapTid, int tupleOffset);
41 : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
42 : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
43 : static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
44 : ScanDirection dir);
45 : static Buffer _bt_walk_left(Relation rel, Relation heaprel, Buffer buf,
46 : Snapshot snapshot);
47 : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
48 : static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
49 :
50 :
51 : /*
52 : * _bt_drop_lock_and_maybe_pin()
53 : *
54 : * Unlock the buffer; and if it is safe to release the pin, do that, too.
55 : * This will prevent vacuum from stalling in a blocked state trying to read a
56 : * page when a cursor is sitting on it.
57 : *
58 : * See nbtree/README section on making concurrent TID recycling safe.
59 : */
60 : static void
61 GIC 5022230 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
62 ECB : {
63 GIC 5022230 : _bt_unlockbuf(scan->indexRelation, sp->buf);
64 ECB :
65 GIC 5022230 : if (IsMVCCSnapshot(scan->xs_snapshot) &&
66 CBC 4849383 : RelationNeedsWAL(scan->indexRelation) &&
67 4846854 : !scan->xs_want_itup)
68 ECB : {
69 GIC 4797883 : ReleaseBuffer(sp->buf);
70 CBC 4797883 : sp->buf = InvalidBuffer;
71 ECB : }
72 GIC 5022230 : }
73 ECB :
74 : /*
75 : * _bt_search() -- Search the tree for a particular scankey,
76 : * or more precisely for the first leaf page it could be on.
77 : *
78 : * The passed scankey is an insertion-type scankey (see nbtree/README),
79 : * but it can omit the rightmost column(s) of the index.
80 : *
81 : * Return value is a stack of parent-page pointers (i.e. there is no entry for
82 : * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
83 : * which is locked and pinned. No locks are held on the parent pages,
84 : * however!
85 : *
86 : * If the snapshot parameter is not NULL, "old snapshot" checking will take
87 : * place during the descent through the tree. This is not needed when
88 : * positioning for an insert or delete, so NULL is used for those cases.
89 : *
90 : * The returned buffer is locked according to access parameter. Additionally,
91 : * access = BT_WRITE will allow an empty root page to be created and returned.
92 : * When access = BT_READ, an empty index will result in *bufP being set to
93 : * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
94 : * during the search will be finished.
95 : */
96 : BTStack
97 GNC 16390240 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
98 : int access, Snapshot snapshot)
99 : {
100 GIC 16390240 : BTStack stack_in = NULL;
101 CBC 16390240 : int page_access = BT_READ;
102 ECB :
103 : /* Get the root page to start with */
104 GNC 16390240 : *bufP = _bt_getroot(rel, heaprel, access);
105 ECB :
106 : /* If index is empty and access = BT_READ, no root page is created. */
107 GIC 16390240 : if (!BufferIsValid(*bufP))
108 CBC 385754 : return (BTStack) NULL;
109 ECB :
110 : /* Loop iterates once per level descended in the tree */
111 : for (;;)
112 GIC 12274663 : {
113 ECB : Page page;
114 : BTPageOpaque opaque;
115 : OffsetNumber offnum;
116 : ItemId itemid;
117 : IndexTuple itup;
118 : BlockNumber child;
119 : BTStack new_stack;
120 :
121 : /*
122 : * Race -- the page we just grabbed may have split since we read its
123 : * downlink in its parent page (or the metapage). If it has, we may
124 : * need to move right to its new sibling. Do that.
125 : *
126 : * In write-mode, allow _bt_moveright to finish any incomplete splits
127 : * along the way. Strictly speaking, we'd only need to finish an
128 : * incomplete split on the leaf page we're about to insert to, not on
129 : * any of the upper levels (internal pages with incomplete splits are
130 : * also taken care of in _bt_getstackbuf). But this is a good
131 : * opportunity to finish splits of internal pages too.
132 : */
133 GNC 28279149 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
134 : stack_in, page_access, snapshot);
135 :
136 : /* if this is a leaf page, we're done */
137 GIC 28279149 : page = BufferGetPage(*bufP);
138 CBC 28279149 : opaque = BTPageGetOpaque(page);
139 28279149 : if (P_ISLEAF(opaque))
140 16004486 : break;
141 ECB :
142 : /*
143 : * Find the appropriate pivot tuple on this page. Its downlink points
144 : * to the child page that we're about to descend to.
145 : */
146 GIC 12274663 : offnum = _bt_binsrch(rel, key, *bufP);
147 CBC 12274663 : itemid = PageGetItemId(page, offnum);
148 12274663 : itup = (IndexTuple) PageGetItem(page, itemid);
149 12274663 : Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
150 12274663 : child = BTreeTupleGetDownLink(itup);
151 ECB :
152 : /*
153 : * We need to save the location of the pivot tuple we chose in a new
154 : * stack entry for this page/level. If caller ends up splitting a
155 : * page one level down, it usually ends up inserting a new pivot
156 : * tuple/downlink immediately after the location recorded here.
157 : */
158 GIC 12274663 : new_stack = (BTStack) palloc(sizeof(BTStackData));
159 CBC 12274663 : new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
160 12274663 : new_stack->bts_offset = offnum;
161 12274663 : new_stack->bts_parent = stack_in;
162 ECB :
163 : /*
164 : * Page level 1 is lowest non-leaf page level prior to leaves. So, if
165 : * we're on the level 1 and asked to lock leaf page in write mode,
166 : * then lock next page in write mode, because it must be a leaf.
167 : */
168 GIC 12274663 : if (opaque->btpo_level == 1 && access == BT_WRITE)
169 CBC 5665181 : page_access = BT_WRITE;
170 ECB :
171 : /* drop the read lock on the page, then acquire one on its child */
172 GIC 12274663 : *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
173 ECB :
174 : /* okay, all set to move down a level */
175 GIC 12274663 : stack_in = new_stack;
176 ECB : }
177 :
178 : /*
179 : * If we're asked to lock leaf in write mode, but didn't manage to, then
180 : * relock. This should only happen when the root page is a leaf page (and
181 : * the only page in the index other than the metapage).
182 : */
183 GIC 16004486 : if (access == BT_WRITE && page_access == BT_READ)
184 ECB : {
185 : /* trade in our read lock for a write lock */
186 GIC 1468243 : _bt_unlockbuf(rel, *bufP);
187 CBC 1468243 : _bt_lockbuf(rel, *bufP, BT_WRITE);
188 ECB :
189 : /*
190 : * Race -- the leaf page may have split after we dropped the read lock
191 : * but before we acquired a write lock. If it has, we may need to
192 : * move right to its new sibling. Do that.
193 : */
194 GNC 1468243 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE,
195 ECB : snapshot);
196 : }
197 :
198 GIC 16004486 : return stack_in;
199 ECB : }
200 :
201 : /*
202 : * _bt_moveright() -- move right in the btree if necessary.
203 : *
204 : * When we follow a pointer to reach a page, it is possible that
205 : * the page has changed in the meanwhile. If this happens, we're
206 : * guaranteed that the page has "split right" -- that is, that any
207 : * data that appeared on the page originally is either on the page
208 : * or strictly to the right of it.
209 : *
210 : * This routine decides whether or not we need to move right in the
211 : * tree by examining the high key entry on the page. If that entry is
212 : * strictly less than the scankey, or <= the scankey in the
213 : * key.nextkey=true case, then we followed the wrong link and we need
214 : * to move right.
215 : *
216 : * The passed insertion-type scankey can omit the rightmost column(s) of the
217 : * index. (see nbtree/README)
218 : *
219 : * When key.nextkey is false (the usual case), we are looking for the first
220 : * item >= key. When key.nextkey is true, we are looking for the first item
221 : * strictly greater than key.
222 : *
223 : * If forupdate is true, we will attempt to finish any incomplete splits
224 : * that we encounter. This is required when locking a target page for an
225 : * insertion, because we don't allow inserting on a page before the split
226 : * is completed. 'stack' is only used if forupdate is true.
227 : *
228 : * On entry, we have the buffer pinned and a lock of the type specified by
229 : * 'access'. If we move right, we release the buffer and lock and acquire
230 : * the same on the right sibling. Return value is the buffer we stop at.
231 : *
232 : * If the snapshot parameter is not NULL, "old snapshot" checking will take
233 : * place during the descent through the tree. This is not needed when
234 : * positioning for an insert or delete, so NULL is used for those cases.
235 : */
236 : Buffer
237 GIC 29747392 : _bt_moveright(Relation rel,
238 : Relation heaprel,
239 ECB : BTScanInsert key,
240 : Buffer buf,
241 : bool forupdate,
242 : BTStack stack,
243 : int access,
244 : Snapshot snapshot)
245 : {
246 : Page page;
247 : BTPageOpaque opaque;
248 : int32 cmpval;
249 :
250 : /*
251 : * When nextkey = false (normal case): if the scan key that brought us to
252 : * this page is > the high key stored on the page, then the page has split
253 : * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
254 : * have some duplicates to the right as well as the left, but that's
255 : * something that's only ever dealt with on the leaf level, after
256 : * _bt_search has found an initial leaf page.)
257 : *
258 : * When nextkey = true: move right if the scan key is >= page's high key.
259 : * (Note that key.scantid cannot be set in this case.)
260 : *
261 : * The page could even have split more than once, so scan as far as
262 : * needed.
263 : *
264 : * We also have to move right if we followed a link that brought us to a
265 : * dead page.
266 : */
267 GIC 29747392 : cmpval = key->nextkey ? 0 : 1;
268 :
269 ECB : for (;;)
270 : {
271 GIC 29748068 : page = BufferGetPage(buf);
272 29748068 : TestForOldSnapshot(snapshot, rel, page);
273 CBC 29748068 : opaque = BTPageGetOpaque(page);
274 ECB :
275 CBC 29748068 : if (P_RIGHTMOST(opaque))
276 GIC 23650629 : break;
277 ECB :
278 : /*
279 : * Finish any incomplete splits we encounter along the way.
280 : */
281 GIC 6097439 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
282 UIC 0 : {
283 LBC 0 : BlockNumber blkno = BufferGetBlockNumber(buf);
284 EUB :
285 : /* upgrade our lock if necessary */
286 UIC 0 : if (access == BT_READ)
287 : {
288 UBC 0 : _bt_unlockbuf(rel, buf);
289 UIC 0 : _bt_lockbuf(rel, buf, BT_WRITE);
290 EUB : }
291 :
292 UIC 0 : if (P_INCOMPLETE_SPLIT(opaque))
293 UNC 0 : _bt_finish_split(rel, heaprel, buf, stack);
294 EUB : else
295 UBC 0 : _bt_relbuf(rel, buf);
296 :
297 EUB : /* re-acquire the lock in the right mode, and re-check */
298 UNC 0 : buf = _bt_getbuf(rel, heaprel, blkno, access);
299 UIC 0 : continue;
300 EUB : }
301 :
302 GIC 6097439 : if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
303 : {
304 ECB : /* step right one page */
305 GIC 676 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
306 676 : continue;
307 ECB : }
308 : else
309 : break;
310 : }
311 :
312 GIC 29747392 : if (P_IGNORE(opaque))
313 UIC 0 : elog(ERROR, "fell off the end of index \"%s\"",
314 ECB : RelationGetRelationName(rel));
315 EUB :
316 GIC 29747392 : return buf;
317 : }
318 ECB :
319 : /*
320 : * _bt_binsrch() -- Do a binary search for a key on a particular page.
321 : *
322 : * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
323 : * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
324 : * particular, this means it is possible to return a value 1 greater than the
325 : * number of keys on the page, if the scankey is > all keys on the page.)
326 : *
327 : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
328 : * of the last key < given scankey, or last key <= given scankey if nextkey
329 : * is true. (Since _bt_compare treats the first data key of such a page as
330 : * minus infinity, there will be at least one key < scankey, so the result
331 : * always points at one of the keys on the page.) This key indicates the
332 : * right place to descend to be sure we find all leaf keys >= given scankey
333 : * (or leaf keys > given scankey when nextkey is true).
334 : *
335 : * This procedure is not responsible for walking right, it just examines
336 : * the given page. _bt_binsrch() has no lock or refcount side effects
337 : * on the buffer.
338 : */
339 : static OffsetNumber
340 GIC 20943015 : _bt_binsrch(Relation rel,
341 : BTScanInsert key,
342 ECB : Buffer buf)
343 : {
344 : Page page;
345 : BTPageOpaque opaque;
346 : OffsetNumber low,
347 : high;
348 : int32 result,
349 : cmpval;
350 :
351 GIC 20943015 : page = BufferGetPage(buf);
352 20943015 : opaque = BTPageGetOpaque(page);
353 ECB :
354 : /* Requesting nextkey semantics while using scantid seems nonsensical */
355 GIC 20943015 : Assert(!key->nextkey || key->scantid == NULL);
356 : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
357 CBC 20943015 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
358 :
359 20943015 : low = P_FIRSTDATAKEY(opaque);
360 GIC 20943015 : high = PageGetMaxOffsetNumber(page);
361 ECB :
362 : /*
363 : * If there are no keys on the page, return the first available slot. Note
364 : * this covers two cases: the page is really empty (no keys), or it
365 : * contains only a high key. The latter case is possible after vacuuming.
366 : * This can never happen on an internal page, however, since they are
367 : * never empty (an internal page must have children).
368 : */
369 GIC 20943015 : if (unlikely(high < low))
370 2170 : return low;
371 ECB :
372 : /*
373 : * Binary search to find the first key on the page >= scan key, or first
374 : * key > scankey when nextkey is true.
375 : *
376 : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
377 : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
378 : *
379 : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
380 : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
381 : *
382 : * We can fall out when high == low.
383 : */
384 GIC 20940845 : high++; /* establish the loop invariant for high */
385 :
386 CBC 20940845 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
387 :
388 127472718 : while (high > low)
389 : {
390 106531873 : OffsetNumber mid = low + ((high - low) / 2);
391 :
392 ECB : /* We have low <= mid < high, so mid points at a real slot */
393 :
394 GIC 106531873 : result = _bt_compare(rel, key, page, mid);
395 :
396 CBC 106531873 : if (result >= cmpval)
397 GIC 70298342 : low = mid + 1;
398 ECB : else
399 CBC 36233531 : high = mid;
400 : }
401 ECB :
402 : /*
403 : * At this point we have high == low, but be careful: they could point
404 : * past the last slot on the page.
405 : *
406 : * On a leaf page, we always return the first key >= scan key (resp. >
407 : * scan key), which could be the last slot + 1.
408 : */
409 GIC 20940845 : if (P_ISLEAF(opaque))
410 8666182 : return low;
411 ECB :
412 : /*
413 : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
414 : * There must be one if _bt_compare() is playing by the rules.
415 : */
416 GIC 12274663 : Assert(low > P_FIRSTDATAKEY(opaque));
417 :
418 CBC 12274663 : return OffsetNumberPrev(low);
419 : }
420 ECB :
421 : /*
422 : *
423 : * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
424 : *
425 : * Like _bt_binsrch(), but with support for caching the binary search
426 : * bounds. Only used during insertion, and only on the leaf page that it
427 : * looks like caller will insert tuple on. Exclusive-locked and pinned
428 : * leaf page is contained within insertstate.
429 : *
430 : * Caches the bounds fields in insertstate so that a subsequent call can
431 : * reuse the low and strict high bounds of original binary search. Callers
432 : * that use these fields directly must be prepared for the case where low
433 : * and/or stricthigh are not on the same page (one or both exceed maxoff
434 : * for the page). The case where there are no items on the page (high <
435 : * low) makes bounds invalid.
436 : *
437 : * Caller is responsible for invalidating bounds when it modifies the page
438 : * before calling here a second time, and for dealing with posting list
439 : * tuple matches (callers can use insertstate's postingoff field to
440 : * determine which existing heap TID will need to be replaced by a posting
441 : * list split).
442 : */
443 : OffsetNumber
444 GIC 12616278 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
445 : {
446 CBC 12616278 : BTScanInsert key = insertstate->itup_key;
447 : Page page;
448 ECB : BTPageOpaque opaque;
449 : OffsetNumber low,
450 : high,
451 : stricthigh;
452 : int32 result,
453 : cmpval;
454 :
455 GIC 12616278 : page = BufferGetPage(insertstate->buf);
456 12616278 : opaque = BTPageGetOpaque(page);
457 ECB :
458 CBC 12616278 : Assert(P_ISLEAF(opaque));
459 GIC 12616278 : Assert(!key->nextkey);
460 CBC 12616278 : Assert(insertstate->postingoff == 0);
461 ECB :
462 CBC 12616278 : if (!insertstate->bounds_valid)
463 : {
464 ECB : /* Start new binary search */
465 GIC 7384948 : low = P_FIRSTDATAKEY(opaque);
466 7384948 : high = PageGetMaxOffsetNumber(page);
467 ECB : }
468 : else
469 : {
470 : /* Restore result of previous binary search against same page */
471 GIC 5231330 : low = insertstate->low;
472 5231330 : high = insertstate->stricthigh;
473 ECB : }
474 :
475 : /* If there are no keys on the page, return the first available slot */
476 GIC 12616278 : if (unlikely(high < low))
477 : {
478 ECB : /* Caller can't reuse bounds */
479 GIC 19103 : insertstate->low = InvalidOffsetNumber;
480 19103 : insertstate->stricthigh = InvalidOffsetNumber;
481 CBC 19103 : insertstate->bounds_valid = false;
482 19103 : return low;
483 ECB : }
484 :
485 : /*
486 : * Binary search to find the first key on the page >= scan key. (nextkey
487 : * is always false when inserting).
488 : *
489 : * The loop invariant is: all slots before 'low' are < scan key, all slots
490 : * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
491 : * maintained to save additional search effort for caller.
492 : *
493 : * We can fall out when high == low.
494 : */
495 GIC 12597175 : if (!insertstate->bounds_valid)
496 7365845 : high++; /* establish the loop invariant for high */
497 CBC 12597175 : stricthigh = high; /* high initially strictly higher */
498 ECB :
499 CBC 12597175 : cmpval = 1; /* !nextkey comparison value */
500 :
501 65283644 : while (high > low)
502 : {
503 52686469 : OffsetNumber mid = low + ((high - low) / 2);
504 :
505 ECB : /* We have low <= mid < high, so mid points at a real slot */
506 :
507 GIC 52686469 : result = _bt_compare(rel, key, page, mid);
508 :
509 CBC 52686469 : if (result >= cmpval)
510 GIC 40616979 : low = mid + 1;
511 ECB : else
512 : {
513 GIC 12069490 : high = mid;
514 12069490 : if (result != 0)
515 CBC 11435404 : stricthigh = high;
516 ECB : }
517 :
518 : /*
519 : * If tuple at offset located by binary search is a posting list whose
520 : * TID range overlaps with caller's scantid, perform posting list
521 : * binary search to set postingoff for caller. Caller must split the
522 : * posting list when postingoff is set. This should happen
523 : * infrequently.
524 : */
525 GIC 52686469 : if (unlikely(result == 0 && key->scantid != NULL))
526 : {
527 ECB : /*
528 : * postingoff should never be set more than once per leaf page
529 : * binary search. That would mean that there are duplicate table
530 : * TIDs in the index, which is never okay. Check for that here.
531 : */
532 GIC 207807 : if (insertstate->postingoff != 0)
533 UIC 0 : ereport(ERROR,
534 ECB : (errcode(ERRCODE_INDEX_CORRUPTED),
535 EUB : errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
536 : ItemPointerGetBlockNumber(key->scantid),
537 : ItemPointerGetOffsetNumber(key->scantid),
538 : low, stricthigh,
539 : BufferGetBlockNumber(insertstate->buf),
540 : RelationGetRelationName(rel))));
541 :
542 GIC 207807 : insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
543 : }
544 ECB : }
545 :
546 : /*
547 : * On a leaf page, a binary search always returns the first key >= scan
548 : * key (at least in !nextkey case), which could be the last slot + 1. This
549 : * is also the lower bound of cached search.
550 : *
551 : * stricthigh may also be the last slot + 1, which prevents caller from
552 : * using bounds directly, but is still useful to us if we're called a
553 : * second time with cached bounds (cached low will be < stricthigh when
554 : * that happens).
555 : */
556 GIC 12597175 : insertstate->low = low;
557 12597175 : insertstate->stricthigh = stricthigh;
558 CBC 12597175 : insertstate->bounds_valid = true;
559 ECB :
560 CBC 12597175 : return low;
561 : }
562 ECB :
563 : /*----------
564 : * _bt_binsrch_posting() -- posting list binary search.
565 : *
566 : * Helper routine for _bt_binsrch_insert().
567 : *
568 : * Returns offset into posting list where caller's scantid belongs.
569 : *----------
570 : */
571 : static int
572 GIC 207807 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
573 : {
574 ECB : IndexTuple itup;
575 : ItemId itemid;
576 : int low,
577 : high,
578 : mid,
579 : res;
580 :
581 : /*
582 : * If this isn't a posting tuple, then the index must be corrupt (if it is
583 : * an ordinary non-pivot tuple then there must be an existing tuple with a
584 : * heap TID that equals inserter's new heap TID/scantid). Defensively
585 : * check that tuple is a posting list tuple whose posting list range
586 : * includes caller's scantid.
587 : *
588 : * (This is also needed because contrib/amcheck's rootdescend option needs
589 : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
590 : */
591 GIC 207807 : itemid = PageGetItemId(page, offnum);
592 207807 : itup = (IndexTuple) PageGetItem(page, itemid);
593 CBC 207807 : if (!BTreeTupleIsPosting(itup))
594 200041 : return 0;
595 ECB :
596 CBC 7766 : Assert(key->heapkeyspace && key->allequalimage);
597 :
598 ECB : /*
599 : * In the event that posting list tuple has LP_DEAD bit set, indicate this
600 : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
601 : * second call to _bt_binsrch_insert() can take place when its caller has
602 : * removed the dead item.
603 : */
604 GIC 7766 : if (ItemIdIsDead(itemid))
605 UIC 0 : return -1;
606 ECB :
607 EUB : /* "high" is past end of posting list for loop invariant */
608 GIC 7766 : low = 0;
609 7766 : high = BTreeTupleGetNPosting(itup);
610 CBC 7766 : Assert(high >= 2);
611 ECB :
612 CBC 62476 : while (high > low)
613 : {
614 54710 : mid = low + ((high - low) / 2);
615 GIC 54710 : res = ItemPointerCompare(key->scantid,
616 ECB : BTreeTupleGetPostingN(itup, mid));
617 :
618 GIC 54710 : if (res > 0)
619 28848 : low = mid + 1;
620 CBC 25862 : else if (res < 0)
621 25862 : high = mid;
622 ECB : else
623 LBC 0 : return mid;
624 : }
625 EUB :
626 : /* Exact match not found */
627 GIC 7766 : return low;
628 : }
629 ECB :
630 : /*----------
631 : * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
632 : *
633 : * page/offnum: location of btree item to be compared to.
634 : *
635 : * This routine returns:
636 : * <0 if scankey < tuple at offnum;
637 : * 0 if scankey == tuple at offnum;
638 : * >0 if scankey > tuple at offnum.
639 : *
640 : * NULLs in the keys are treated as sortable values. Therefore
641 : * "equality" does not necessarily mean that the item should be returned
642 : * to the caller as a matching key. Similarly, an insertion scankey
643 : * with its scantid set is treated as equal to a posting tuple whose TID
644 : * range overlaps with their scantid. There generally won't be a
645 : * matching TID in the posting tuple, which caller must handle
646 : * themselves (e.g., by splitting the posting list tuple).
647 : *
648 : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
649 : * "minus infinity": this routine will always claim it is less than the
650 : * scankey. The actual key value stored is explicitly truncated to 0
651 : * attributes (explicitly minus infinity) with version 3+ indexes, but
652 : * that isn't relied upon. This allows us to implement the Lehman and
653 : * Yao convention that the first down-link pointer is before the first
654 : * key. See backend/access/nbtree/README for details.
655 : *----------
656 : */
657 : int32
658 GIC 177440905 : _bt_compare(Relation rel,
659 : BTScanInsert key,
660 ECB : Page page,
661 : OffsetNumber offnum)
662 : {
663 GIC 177440905 : TupleDesc itupdesc = RelationGetDescr(rel);
664 177440905 : BTPageOpaque opaque = BTPageGetOpaque(page);
665 ECB : IndexTuple itup;
666 : ItemPointer heapTid;
667 : ScanKey scankey;
668 : int ncmpkey;
669 : int ntupatts;
670 : int32 result;
671 :
672 GIC 177440905 : Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
673 177440905 : Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
674 CBC 177440905 : Assert(key->heapkeyspace || key->scantid == NULL);
675 ECB :
676 : /*
677 : * Force result ">" if target item is first data item on an internal page
678 : * --- see NOTE above.
679 : */
680 GIC 177440905 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
681 1889977 : return 1;
682 ECB :
683 CBC 175550928 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
684 GIC 175550928 : ntupatts = BTreeTupleGetNAtts(itup, rel);
685 ECB :
686 : /*
687 : * The scan key is set up with the attribute number associated with each
688 : * term in the key. It is important that, if the index is multi-key, the
689 : * scan contain the first k key attributes, and that they be in order. If
690 : * you think about how multi-key ordering works, you'll understand why
691 : * this is.
692 : *
693 : * We don't test for violation of this condition here, however. The
694 : * initial setup for the index scan had better have gotten it right (see
695 : * _bt_first).
696 : */
697 :
698 GIC 175550928 : ncmpkey = Min(ntupatts, key->keysz);
699 175550928 : Assert(key->heapkeyspace || ncmpkey == key->keysz);
700 CBC 175550928 : Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
701 175550928 : scankey = key->scankeys;
702 220720244 : for (int i = 1; i <= ncmpkey; i++)
703 ECB : {
704 : Datum datum;
705 : bool isNull;
706 :
707 GIC 207864745 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
708 :
709 CBC 207864745 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
710 : {
711 170918 : if (isNull)
712 GIC 83 : result = 0; /* NULL "=" NULL */
713 CBC 170835 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
714 132 : result = -1; /* NULL "<" NOT_NULL */
715 ECB : else
716 CBC 170703 : result = 1; /* NULL ">" NOT_NULL */
717 : }
718 207693827 : else if (isNull) /* key is NOT_NULL and item is NULL */
719 : {
720 132 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
721 UIC 0 : result = 1; /* NOT_NULL ">" NULL */
722 ECB : else
723 GBC 132 : result = -1; /* NOT_NULL "<" NULL */
724 : }
725 ECB : else
726 : {
727 : /*
728 : * The sk_func needs to be passed the index value as left arg and
729 : * the sk_argument as right arg (they might be of different
730 : * types). Since it is convenient for callers to think of
731 : * _bt_compare as comparing the scankey to the index item, we have
732 : * to flip the sign of the comparison result. (Unless it's a DESC
733 : * column, in which case we *don't* flip the sign.)
734 : */
735 GIC 207693695 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
736 : scankey->sk_collation,
737 ECB : datum,
738 : scankey->sk_argument));
739 :
740 GIC 207693695 : if (!(scankey->sk_flags & SK_BT_DESC))
741 207693692 : INVERT_COMPARE_RESULT(result);
742 ECB : }
743 :
744 : /* if the keys are unequal, return the difference */
745 GIC 207864745 : if (result != 0)
746 162695429 : return result;
747 ECB :
748 CBC 45169316 : scankey++;
749 : }
750 ECB :
751 : /*
752 : * All non-truncated attributes (other than heap TID) were found to be
753 : * equal. Treat truncated attributes as minus infinity when scankey has a
754 : * key attribute value that would otherwise be compared directly.
755 : *
756 : * Note: it doesn't matter if ntupatts includes non-key attributes;
757 : * scankey won't, so explicitly excluding non-key attributes isn't
758 : * necessary.
759 : */
760 GIC 12855499 : if (key->keysz > ntupatts)
761 125172 : return 1;
762 ECB :
763 : /*
764 : * Use the heap TID attribute and scantid to try to break the tie. The
765 : * rules are the same as any other key attribute -- only the
766 : * representation differs.
767 : */
768 GIC 12730327 : heapTid = BTreeTupleGetHeapTID(itup);
769 12730327 : if (key->scantid == NULL)
770 ECB : {
771 : /*
772 : * Most searches have a scankey that is considered greater than a
773 : * truncated pivot tuple if and when the scankey has equal values for
774 : * attributes up to and including the least significant untruncated
775 : * attribute in tuple.
776 : *
777 : * For example, if an index has the minimum two attributes (single
778 : * user key attribute, plus heap TID attribute), and a page's high key
779 : * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
780 : * will not descend to the page to the left. The search will descend
781 : * right instead. The truncated attribute in pivot tuple means that
782 : * all non-pivot tuples on the page to the left are strictly < 'foo',
783 : * so it isn't necessary to descend left. In other words, search
784 : * doesn't have to descend left because it isn't interested in a match
785 : * that has a heap TID value of -inf.
786 : *
787 : * However, some searches (pivotsearch searches) actually require that
788 : * we descend left when this happens. -inf is treated as a possible
789 : * match for omitted scankey attribute(s). This is needed by page
790 : * deletion, which must re-find leaf pages that are targets for
791 : * deletion using their high keys.
792 : *
793 : * Note: the heap TID part of the test ensures that scankey is being
794 : * compared to a pivot tuple with one or more truncated key
795 : * attributes.
796 : *
797 : * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
798 : * left here, since they have no heap TID attribute (and cannot have
799 : * any -inf key values in any case, since truncation can only remove
800 : * non-key attributes). !heapkeyspace searches must always be
801 : * prepared to deal with matches on both sides of the pivot once the
802 : * leaf level is reached.
803 : */
804 GIC 9213987 : if (key->heapkeyspace && !key->pivotsearch &&
805 9207253 : key->keysz == ntupatts && heapTid == NULL)
806 CBC 7404 : return 1;
807 ECB :
808 : /* All provided scankey arguments found to be equal */
809 GIC 9206583 : return 0;
810 : }
811 ECB :
812 : /*
813 : * Treat truncated heap TID as minus infinity, since scankey has a key
814 : * attribute value (scantid) that would otherwise be compared directly
815 : */
816 GIC 3516340 : Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
817 3516340 : if (heapTid == NULL)
818 CBC 1978 : return 1;
819 ECB :
820 : /*
821 : * Scankey must be treated as equal to a posting list tuple if its scantid
822 : * value falls within the range of the posting list. In all other cases
823 : * there can only be a single heap TID value, which is compared directly
824 : * with scantid.
825 : */
826 GIC 3514362 : Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
827 3514362 : result = ItemPointerCompare(key->scantid, heapTid);
828 CBC 3513995 : if (result <= 0 || !BTreeTupleIsPosting(itup))
829 3394046 : return result;
830 ECB : else
831 : {
832 GIC 119949 : result = ItemPointerCompare(key->scantid,
833 : BTreeTupleGetMaxHeapTID(itup));
834 CBC 119949 : if (result > 0)
835 GIC 112183 : return 1;
836 ECB : }
837 :
838 GIC 7766 : return 0;
839 : }
840 ECB :
841 : /*
842 : * _bt_first() -- Find the first item in a scan.
843 : *
844 : * We need to be clever about the direction of scan, the search
845 : * conditions, and the tree ordering. We find the first item (or,
846 : * if backwards scan, the last item) in the tree that satisfies the
847 : * qualifications in the scan key. On success exit, the page containing
848 : * the current index tuple is pinned but not locked, and data about
849 : * the matching tuple(s) on the page has been loaded into so->currPos.
850 : * scan->xs_ctup.t_self is set to the heap TID of the current tuple,
851 : * and if requested, scan->xs_itup points to a copy of the index tuple.
852 : *
853 : * If there are no matching items in the index, we return false, with no
854 : * pins or locks held.
855 : *
856 : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
857 : * are both search-type scankeys (see nbtree/README for more about this).
858 : * Within this routine, we build a temporary insertion-type scankey to use
859 : * in locating the scan start position.
860 : */
861 : bool
862 GIC 9087979 : _bt_first(IndexScanDesc scan, ScanDirection dir)
863 : {
864 CBC 9087979 : Relation rel = scan->indexRelation;
865 GNC 9087979 : Relation heaprel = scan->heapRelation;
866 GIC 9087979 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
867 ECB : Buffer buf;
868 : BTStack stack;
869 : OffsetNumber offnum;
870 : StrategyNumber strat;
871 : bool nextkey;
872 : bool goback;
873 : BTScanInsertData inskey;
874 : ScanKey startKeys[INDEX_MAX_KEYS];
875 : ScanKeyData notnullkeys[INDEX_MAX_KEYS];
876 GIC 9087979 : int keysCount = 0;
877 : int i;
878 : bool status;
879 ECB : StrategyNumber strat_total;
880 : BTScanPosItem *currItem;
881 : BlockNumber blkno;
882 :
883 GIC 9087979 : Assert(!BTScanPosIsValid(so->currPos));
884 :
885 9087979 : pgstat_count_index_scan(rel);
886 ECB :
887 : /*
888 : * Examine the scan keys and eliminate any redundant keys; also mark the
889 : * keys that must be matched to continue the scan.
890 : */
891 GIC 9087979 : _bt_preprocess_keys(scan);
892 :
893 : /*
894 ECB : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
895 : * never be satisfied (eg, x == 1 AND x > 2).
896 : */
897 GIC 9087979 : if (!so->qual_ok)
898 : {
899 : /* Notify any other workers that we're done with this scan key. */
900 CBC 1286 : _bt_parallel_done(scan);
901 GIC 1286 : return false;
902 : }
903 ECB :
904 : /*
905 : * For parallel scans, get the starting page from shared state. If the
906 : * scan has not started, proceed to find out first leaf page in the usual
907 : * way while keeping other participating processes waiting. If the scan
908 : * has already begun, use the page number from the shared structure.
909 : */
910 GIC 9086693 : if (scan->parallel_scan != NULL)
911 : {
912 184 : status = _bt_parallel_seize(scan, &blkno);
913 CBC 184 : if (!status)
914 GIC 140 : return false;
915 CBC 44 : else if (blkno == P_NONE)
916 ECB : {
917 CBC 2 : _bt_parallel_done(scan);
918 2 : return false;
919 : }
920 42 : else if (blkno != InvalidBlockNumber)
921 ECB : {
922 GIC 4 : if (!_bt_parallel_readpage(scan, blkno, dir))
923 LBC 0 : return false;
924 GIC 4 : goto readcomplete;
925 ECB : }
926 EUB : }
927 ECB :
928 : /*----------
929 : * Examine the scan keys to discover where we need to start the scan.
930 : *
931 : * We want to identify the keys that can be used as starting boundaries;
932 : * these are =, >, or >= keys for a forward scan or =, <, <= keys for
933 : * a backwards scan. We can use keys for multiple attributes so long as
934 : * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
935 : * a > or < boundary or find an attribute with no boundary (which can be
936 : * thought of as the same as "> -infinity"), we can't use keys for any
937 : * attributes to its right, because it would break our simplistic notion
938 : * of what initial positioning strategy to use.
939 : *
940 : * When the scan keys include cross-type operators, _bt_preprocess_keys
941 : * may not be able to eliminate redundant keys; in such cases we will
942 : * arbitrarily pick a usable one for each attribute. This is correct
943 : * but possibly not optimal behavior. (For example, with keys like
944 : * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
945 : * x=5 would be more efficient.) Since the situation only arises given
946 : * a poorly-worded query plus an incomplete opfamily, live with it.
947 : *
948 : * When both equality and inequality keys appear for a single attribute
949 : * (again, only possible when cross-type operators appear), we *must*
950 : * select one of the equality keys for the starting point, because
951 : * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
952 : * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
953 : * start at x=4, we will fail and stop before reaching x=10. If multiple
954 : * equality quals survive preprocessing, however, it doesn't matter which
955 : * one we use --- by definition, they are either redundant or
956 : * contradictory.
957 : *
958 : * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
959 : * If the index stores nulls at the end of the index we'll be starting
960 : * from, and we have no boundary key for the column (which means the key
961 : * we deduced NOT NULL from is an inequality key that constrains the other
962 : * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
963 : * use as a boundary key. If we didn't do this, we might find ourselves
964 : * traversing a lot of null entries at the start of the scan.
965 : *
966 : * In this loop, row-comparison keys are treated the same as keys on their
967 : * first (leftmost) columns. We'll add on lower-order columns of the row
968 : * comparison below, if possible.
969 : *
970 : * The selected scan keys (at most one per index column) are remembered by
971 : * storing their addresses into the local startKeys[] array.
972 : *----------
973 : */
974 GIC 9086547 : strat_total = BTEqualStrategyNumber;
975 9086547 : if (so->numberOfKeys > 0)
976 : {
977 ECB : AttrNumber curattr;
978 : ScanKey chosen;
979 : ScanKey impliesNN;
980 : ScanKey cur;
981 :
982 : /*
983 : * chosen is the so-far-chosen key for the current attribute, if any.
984 : * We don't cast the decision in stone until we reach keys for the
985 : * next attribute.
986 : */
987 GIC 9081689 : curattr = 1;
988 9081689 : chosen = NULL;
989 : /* Also remember any scankey that implies a NOT NULL constraint */
990 CBC 9081689 : impliesNN = NULL;
991 ECB :
992 : /*
993 : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
994 : * pass to handle after-last-key processing. Actual exit from the
995 : * loop is at one of the "break" statements below.
996 : */
997 GIC 24989949 : for (cur = so->keyData, i = 0;; cur++, i++)
998 : {
999 24989949 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1000 ECB : {
1001 : /*
1002 : * Done looking at keys for curattr. If we didn't find a
1003 : * usable boundary key, see if we can deduce a NOT NULL key.
1004 : */
1005 GIC 15934530 : if (chosen == NULL && impliesNN != NULL &&
1006 25904 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1007 : ScanDirectionIsForward(dir) :
1008 ECB : ScanDirectionIsBackward(dir)))
1009 : {
1010 : /* Yes, so build the key in notnullkeys[keysCount] */
1011 GIC 3 : chosen = ¬nullkeys[keysCount];
1012 3 : ScanKeyEntryInitialize(chosen,
1013 : (SK_SEARCHNOTNULL | SK_ISNULL |
1014 CBC 3 : (impliesNN->sk_flags &
1015 ECB : (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1016 : curattr,
1017 CBC 3 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1018 : BTGreaterStrategyNumber :
1019 : BTLessStrategyNumber),
1020 ECB : InvalidOid,
1021 : InvalidOid,
1022 : InvalidOid,
1023 : (Datum) 0);
1024 : }
1025 :
1026 : /*
1027 : * If we still didn't find a usable boundary key, quit; else
1028 : * save the boundary key pointer in startKeys.
1029 : */
1030 GIC 15908626 : if (chosen == NULL)
1031 27175 : break;
1032 15881451 : startKeys[keysCount++] = chosen;
1033 ECB :
1034 : /*
1035 : * Adjust strat_total, and quit if we have stored a > or <
1036 : * key.
1037 : */
1038 GIC 15881451 : strat = chosen->sk_strategy;
1039 15881451 : if (strat != BTEqualStrategyNumber)
1040 : {
1041 CBC 709058 : strat_total = strat;
1042 709058 : if (strat == BTGreaterStrategyNumber ||
1043 : strat == BTLessStrategyNumber)
1044 ECB : break;
1045 : }
1046 :
1047 : /*
1048 : * Done if that was the last attribute, or if next key is not
1049 : * in sequence (implying no boundary key is available for the
1050 : * next attribute).
1051 : */
1052 GIC 15174487 : if (i >= so->numberOfKeys ||
1053 6827667 : cur->sk_attno != curattr + 1)
1054 : break;
1055 ECB :
1056 : /*
1057 : * Reset for next attr.
1058 : */
1059 GIC 6826937 : curattr = cur->sk_attno;
1060 6826937 : chosen = NULL;
1061 6826937 : impliesNN = NULL;
1062 ECB : }
1063 :
1064 : /*
1065 : * Can we use this key as a starting boundary for this attr?
1066 : *
1067 : * If not, does it imply a NOT NULL constraint? (Because
1068 : * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1069 : * *any* inequality key works for that; we need not test.)
1070 : */
1071 GIC 15908260 : switch (cur->sk_strategy)
1072 : {
1073 48301 : case BTLessStrategyNumber:
1074 ECB : case BTLessEqualStrategyNumber:
1075 GIC 48301 : if (chosen == NULL)
1076 ECB : {
1077 GIC 47399 : if (ScanDirectionIsBackward(dir))
1078 CBC 21504 : chosen = cur;
1079 : else
1080 25895 : impliesNN = cur;
1081 ECB : }
1082 GIC 48301 : break;
1083 CBC 15172393 : case BTEqualStrategyNumber:
1084 : /* override any non-equality choice */
1085 15172393 : chosen = cur;
1086 15172393 : break;
1087 GIC 687566 : case BTGreaterEqualStrategyNumber:
1088 ECB : case BTGreaterStrategyNumber:
1089 CBC 687566 : if (chosen == NULL)
1090 ECB : {
1091 GIC 687566 : if (ScanDirectionIsForward(dir))
1092 CBC 687551 : chosen = cur;
1093 : else
1094 15 : impliesNN = cur;
1095 ECB : }
1096 GIC 687566 : break;
1097 ECB : }
1098 : }
1099 : }
1100 :
1101 : /*
1102 : * If we found no usable boundary keys, we have to start from one end of
1103 : * the tree. Walk down that edge to the first or last key, and scan from
1104 : * there.
1105 : */
1106 GIC 9086547 : if (keysCount == 0)
1107 : {
1108 : bool match;
1109 ECB :
1110 GIC 31957 : match = _bt_endpoint(scan, dir);
1111 :
1112 31957 : if (!match)
1113 ECB : {
1114 : /* No match, so mark (parallel) scan finished */
1115 CBC 3978 : _bt_parallel_done(scan);
1116 : }
1117 :
1118 31957 : return match;
1119 : }
1120 :
1121 ECB : /*
1122 : * We want to start the scan somewhere within the index. Set up an
1123 : * insertion scankey we can use to search for the boundary point we
1124 : * identified above. The insertion scankey is built using the keys
1125 : * identified by startKeys[]. (Remaining insertion scankey fields are
1126 : * initialized after initial-positioning strategy is finalized.)
1127 : */
1128 GIC 9054590 : Assert(keysCount <= INDEX_MAX_KEYS);
1129 24936029 : for (i = 0; i < keysCount; i++)
1130 : {
1131 CBC 15881451 : ScanKey cur = startKeys[i];
1132 ECB :
1133 GIC 15881451 : Assert(cur->sk_attno == i + 1);
1134 ECB :
1135 GIC 15881451 : if (cur->sk_flags & SK_ROW_HEADER)
1136 ECB : {
1137 : /*
1138 : * Row comparison header: look to the first row member instead.
1139 : *
1140 : * The member scankeys are already in insertion format (ie, they
1141 : * have sk_func = 3-way-comparison function), but we have to watch
1142 : * out for nulls, which _bt_preprocess_keys didn't check. A null
1143 : * in the first row member makes the condition unmatchable, just
1144 : * like qual_ok = false.
1145 : */
1146 GIC 12 : ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1147 :
1148 12 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1149 CBC 12 : if (subkey->sk_flags & SK_ISNULL)
1150 : {
1151 LBC 0 : _bt_parallel_done(scan);
1152 0 : return false;
1153 : }
1154 GBC 12 : memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1155 EUB :
1156 : /*
1157 ECB : * If the row comparison is the last positioning key we accepted,
1158 : * try to add additional keys from the lower-order row members.
1159 : * (If we accepted independent conditions on additional index
1160 : * columns, we use those instead --- doesn't seem worth trying to
1161 : * determine which is more restrictive.) Note that this is OK
1162 : * even if the row comparison is of ">" or "<" type, because the
1163 : * condition applied to all but the last row member is effectively
1164 : * ">=" or "<=", and so the extra keys don't break the positioning
1165 : * scheme. But, by the same token, if we aren't able to use all
1166 : * the row members, then the part of the row comparison that we
1167 : * did use has to be treated as just a ">=" or "<=" condition, and
1168 : * so we'd better adjust strat_total accordingly.
1169 : */
1170 GIC 12 : if (i == keysCount - 1)
1171 : {
1172 12 : bool used_all_subkeys = false;
1173 ECB :
1174 GIC 12 : Assert(!(subkey->sk_flags & SK_ROW_END));
1175 ECB : for (;;)
1176 : {
1177 CBC 12 : subkey++;
1178 GIC 12 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1179 12 : if (subkey->sk_attno != keysCount + 1)
1180 LBC 0 : break; /* out-of-sequence, can't use it */
1181 CBC 12 : if (subkey->sk_strategy != cur->sk_strategy)
1182 LBC 0 : break; /* wrong direction, can't use it */
1183 GBC 12 : if (subkey->sk_flags & SK_ISNULL)
1184 LBC 0 : break; /* can't use null keys */
1185 GBC 12 : Assert(keysCount < INDEX_MAX_KEYS);
1186 CBC 12 : memcpy(inskey.scankeys + keysCount, subkey,
1187 EUB : sizeof(ScanKeyData));
1188 CBC 12 : keysCount++;
1189 12 : if (subkey->sk_flags & SK_ROW_END)
1190 : {
1191 12 : used_all_subkeys = true;
1192 12 : break;
1193 : }
1194 ECB : }
1195 CBC 12 : if (!used_all_subkeys)
1196 : {
1197 UIC 0 : switch (strat_total)
1198 ECB : {
1199 UIC 0 : case BTLessStrategyNumber:
1200 UBC 0 : strat_total = BTLessEqualStrategyNumber;
1201 UIC 0 : break;
1202 UBC 0 : case BTGreaterStrategyNumber:
1203 0 : strat_total = BTGreaterEqualStrategyNumber;
1204 0 : break;
1205 EUB : }
1206 : }
1207 GBC 12 : break; /* done with outer loop */
1208 : }
1209 : }
1210 ECB : else
1211 : {
1212 : /*
1213 : * Ordinary comparison key. Transform the search-style scan key
1214 : * to an insertion scan key by replacing the sk_func with the
1215 : * appropriate btree comparison function.
1216 : *
1217 : * If scankey operator is not a cross-type comparison, we can use
1218 : * the cached comparison function; otherwise gotta look it up in
1219 : * the catalogs. (That can't lead to infinite recursion, since no
1220 : * indexscan initiated by syscache lookup will use cross-data-type
1221 : * operators.)
1222 : *
1223 : * We support the convention that sk_subtype == InvalidOid means
1224 : * the opclass input type; this is a hack to simplify life for
1225 : * ScanKeyInit().
1226 : */
1227 GIC 15881439 : if (cur->sk_subtype == rel->rd_opcintype[i] ||
1228 15621462 : cur->sk_subtype == InvalidOid)
1229 15875108 : {
1230 ECB : FmgrInfo *procinfo;
1231 :
1232 CBC 15875108 : procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1233 GIC 15875108 : ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1234 : cur->sk_flags,
1235 CBC 15875108 : cur->sk_attno,
1236 ECB : InvalidStrategy,
1237 : cur->sk_subtype,
1238 : cur->sk_collation,
1239 : procinfo,
1240 : cur->sk_argument);
1241 : }
1242 : else
1243 : {
1244 : RegProcedure cmp_proc;
1245 :
1246 GIC 6331 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1247 6331 : rel->rd_opcintype[i],
1248 : cur->sk_subtype,
1249 ECB : BTORDER_PROC);
1250 CBC 6331 : if (!RegProcedureIsValid(cmp_proc))
1251 UIC 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1252 : BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1253 ECB : cur->sk_attno, RelationGetRelationName(rel));
1254 GBC 6331 : ScanKeyEntryInitialize(inskey.scankeys + i,
1255 : cur->sk_flags,
1256 GIC 6331 : cur->sk_attno,
1257 ECB : InvalidStrategy,
1258 : cur->sk_subtype,
1259 : cur->sk_collation,
1260 : cmp_proc,
1261 : cur->sk_argument);
1262 : }
1263 : }
1264 : }
1265 :
1266 : /*----------
1267 : * Examine the selected initial-positioning strategy to determine exactly
1268 : * where we need to start the scan, and set flag variables to control the
1269 : * code below.
1270 : *
1271 : * If nextkey = false, _bt_search and _bt_binsrch will locate the first
1272 : * item >= scan key. If nextkey = true, they will locate the first
1273 : * item > scan key.
1274 : *
1275 : * If goback = true, we will then step back one item, while if
1276 : * goback = false, we will start the scan on the located item.
1277 : *----------
1278 : */
1279 GIC 9054590 : switch (strat_total)
1280 : {
1281 21507 : case BTLessStrategyNumber:
1282 ECB :
1283 : /*
1284 : * Find first item >= scankey, then back up one to arrive at last
1285 : * item < scankey. (Note: this positioning strategy is only used
1286 : * for a backward scan, so that is always the correct starting
1287 : * position.)
1288 : */
1289 GIC 21507 : nextkey = false;
1290 21507 : goback = true;
1291 21507 : break;
1292 ECB :
1293 LBC 0 : case BTLessEqualStrategyNumber:
1294 ECB :
1295 : /*
1296 EUB : * Find first item > scankey, then back up one to arrive at last
1297 : * item <= scankey. (Note: this positioning strategy is only used
1298 : * for a backward scan, so that is always the correct starting
1299 : * position.)
1300 : */
1301 UIC 0 : nextkey = true;
1302 0 : goback = true;
1303 0 : break;
1304 EUB :
1305 GBC 8345532 : case BTEqualStrategyNumber:
1306 EUB :
1307 : /*
1308 ECB : * If a backward scan was specified, need to start with last equal
1309 : * item not first one.
1310 : */
1311 GIC 8345532 : if (ScanDirectionIsBackward(dir))
1312 : {
1313 : /*
1314 ECB : * This is the same as the <= strategy. We will check at the
1315 : * end whether the found item is actually =.
1316 : */
1317 GIC 64 : nextkey = true;
1318 64 : goback = true;
1319 : }
1320 ECB : else
1321 : {
1322 : /*
1323 : * This is the same as the >= strategy. We will check at the
1324 : * end whether the found item is actually =.
1325 : */
1326 GIC 8345468 : nextkey = false;
1327 8345468 : goback = false;
1328 : }
1329 CBC 8345532 : break;
1330 ECB :
1331 GIC 2094 : case BTGreaterEqualStrategyNumber:
1332 ECB :
1333 : /*
1334 : * Find first item >= scankey. (This is only used for forward
1335 : * scans.)
1336 : */
1337 GIC 2094 : nextkey = false;
1338 2094 : goback = false;
1339 2094 : break;
1340 ECB :
1341 CBC 685457 : case BTGreaterStrategyNumber:
1342 ECB :
1343 : /*
1344 : * Find first item > scankey. (This is only used for forward
1345 : * scans.)
1346 : */
1347 GIC 685457 : nextkey = true;
1348 685457 : goback = false;
1349 685457 : break;
1350 ECB :
1351 LBC 0 : default:
1352 ECB : /* can't get here, but keep compiler quiet */
1353 UIC 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1354 EUB : return false;
1355 : }
1356 :
1357 : /* Initialize remaining insertion scan key fields */
1358 GNC 9054590 : _bt_metaversion(rel, heaprel, &inskey.heapkeyspace, &inskey.allequalimage);
1359 GIC 9054590 : inskey.anynullkeys = false; /* unused */
1360 9054590 : inskey.nextkey = nextkey;
1361 CBC 9054590 : inskey.pivotsearch = false;
1362 9054590 : inskey.scantid = NULL;
1363 9054590 : inskey.keysz = keysCount;
1364 ECB :
1365 : /*
1366 : * Use the manufactured insertion scan key to descend the tree and
1367 : * position ourselves on the target leaf page.
1368 : */
1369 GNC 9054590 : stack = _bt_search(rel, heaprel, &inskey, &buf, BT_READ, scan->xs_snapshot);
1370 :
1371 : /* don't need to keep the stack around... */
1372 CBC 9054590 : _bt_freestack(stack);
1373 :
1374 GIC 9054590 : if (!BufferIsValid(buf))
1375 ECB : {
1376 : /*
1377 : * We only get here if the index is completely empty. Lock relation
1378 : * because nothing finer to lock exists.
1379 : */
1380 GIC 385759 : PredicateLockRelation(rel, scan->xs_snapshot);
1381 :
1382 : /*
1383 ECB : * mark parallel scan as done, so that all the workers can finish
1384 : * their scan
1385 : */
1386 GIC 385759 : _bt_parallel_done(scan);
1387 385759 : BTScanPosInvalidate(so->currPos);
1388 :
1389 CBC 385759 : return false;
1390 ECB : }
1391 : else
1392 CBC 8668831 : PredicateLockPage(rel, BufferGetBlockNumber(buf),
1393 : scan->xs_snapshot);
1394 :
1395 8668831 : _bt_initialize_more_data(so, dir);
1396 :
1397 : /* position to the precise item on the page */
1398 8668831 : offnum = _bt_binsrch(rel, &inskey, buf);
1399 :
1400 : /*
1401 ECB : * If nextkey = false, we are positioned at the first item >= scan key, or
1402 : * possibly at the end of a page on which all the existing items are less
1403 : * than the scan key and we know that everything on later pages is greater
1404 : * than or equal to scan key.
1405 : *
1406 : * If nextkey = true, we are positioned at the first item > scan key, or
1407 : * possibly at the end of a page on which all the existing items are less
1408 : * than or equal to the scan key and we know that everything on later
1409 : * pages is greater than scan key.
1410 : *
1411 : * The actually desired starting point is either this item or the prior
1412 : * one, or in the end-of-page case it's the first item on the next page or
1413 : * the last item on this page. Adjust the starting offset if needed. (If
1414 : * this results in an offset before the first item or after the last one,
1415 : * _bt_readpage will report no items found, and then we'll step to the
1416 : * next page as needed.)
1417 : */
1418 GIC 8668831 : if (goback)
1419 21567 : offnum = OffsetNumberPrev(offnum);
1420 :
1421 ECB : /* remember which buffer we have pinned, if any */
1422 CBC 8668831 : Assert(!BTScanPosIsValid(so->currPos));
1423 GIC 8668831 : so->currPos.buf = buf;
1424 :
1425 ECB : /*
1426 : * Now load data from the first page of the scan.
1427 : */
1428 GIC 8668831 : if (!_bt_readpage(scan, dir, offnum))
1429 : {
1430 : /*
1431 ECB : * There's no actually-matching data on this page. Try to advance to
1432 : * the next page. Return false if there's no matching data at all.
1433 : */
1434 GIC 3688427 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
1435 3688427 : if (!_bt_steppage(scan, dir))
1436 3688414 : return false;
1437 ECB : }
1438 : else
1439 : {
1440 : /* Drop the lock, and maybe the pin, on the current page */
1441 GIC 4980404 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1442 : }
1443 :
1444 CBC 4980421 : readcomplete:
1445 : /* OK, itemIndex says what to return */
1446 GIC 4980421 : currItem = &so->currPos.items[so->currPos.itemIndex];
1447 CBC 4980421 : scan->xs_heaptid = currItem->heapTid;
1448 GIC 4980421 : if (scan->xs_want_itup)
1449 CBC 64578 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1450 ECB :
1451 CBC 4980421 : return true;
1452 ECB : }
1453 :
1454 : /*
1455 : * _bt_next() -- Get the next item in a scan.
1456 : *
1457 : * On entry, so->currPos describes the current page, which may be pinned
1458 : * but is not locked, and so->currPos.itemIndex identifies which item was
1459 : * previously returned.
1460 : *
1461 : * On successful exit, scan->xs_ctup.t_self is set to the TID of the
1462 : * next heap tuple, and if requested, scan->xs_itup points to a copy of
1463 : * the index tuple. so->currPos is updated as needed.
1464 : *
1465 : * On failure exit (no more tuples), we release pin and set
1466 : * so->currPos.buf to InvalidBuffer.
1467 : */
1468 : bool
1469 GIC 11957219 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1470 : {
1471 11957219 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1472 ECB : BTScanPosItem *currItem;
1473 :
1474 : /*
1475 : * Advance to next tuple on current page; or if there's no more, try to
1476 : * step to the next page with data.
1477 : */
1478 GIC 11957219 : if (ScanDirectionIsForward(dir))
1479 : {
1480 11948015 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1481 ECB : {
1482 GIC 1161767 : if (!_bt_steppage(scan, dir))
1483 CBC 1147466 : return false;
1484 : }
1485 ECB : }
1486 : else
1487 : {
1488 GIC 9204 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1489 : {
1490 37 : if (!_bt_steppage(scan, dir))
1491 CBC 31 : return false;
1492 : }
1493 ECB : }
1494 :
1495 : /* OK, itemIndex says what to return */
1496 GIC 10809722 : currItem = &so->currPos.items[so->currPos.itemIndex];
1497 10809722 : scan->xs_heaptid = currItem->heapTid;
1498 10809722 : if (scan->xs_want_itup)
1499 CBC 1788657 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1500 ECB :
1501 CBC 10809722 : return true;
1502 ECB : }
1503 :
1504 : /*
1505 : * _bt_readpage() -- Load data from current index page into so->currPos
1506 : *
1507 : * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1508 : * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1509 : * they are updated as appropriate. All other fields of so->currPos are
1510 : * initialized from scratch here.
1511 : *
1512 : * We scan the current page starting at offnum and moving in the indicated
1513 : * direction. All items matching the scan keys are loaded into currPos.items.
1514 : * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1515 : * that there can be no more matching tuples in the current scan direction.
1516 : *
1517 : * In the case of a parallel scan, caller must have called _bt_parallel_seize
1518 : * prior to calling this function; this function will invoke
1519 : * _bt_parallel_release before returning.
1520 : *
1521 : * Returns true if any matching items found on the page, false if none.
1522 : */
1523 : static bool
1524 GIC 8712708 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
1525 : {
1526 8712708 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1527 ECB : Page page;
1528 : BTPageOpaque opaque;
1529 : OffsetNumber minoff;
1530 : OffsetNumber maxoff;
1531 : int itemIndex;
1532 : bool continuescan;
1533 : int indnatts;
1534 :
1535 : /*
1536 : * We must have the buffer pinned and locked, but the usual macro can't be
1537 : * used here; this function is what makes it good for currPos.
1538 : */
1539 GIC 8712708 : Assert(BufferIsValid(so->currPos.buf));
1540 :
1541 8712708 : page = BufferGetPage(so->currPos.buf);
1542 CBC 8712708 : opaque = BTPageGetOpaque(page);
1543 :
1544 ECB : /* allow next page be processed by parallel worker */
1545 CBC 8712708 : if (scan->parallel_scan)
1546 : {
1547 GIC 644 : if (ScanDirectionIsForward(dir))
1548 CBC 644 : _bt_parallel_release(scan, opaque->btpo_next);
1549 : else
1550 LBC 0 : _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
1551 ECB : }
1552 :
1553 GBC 8712708 : continuescan = true; /* default assumption */
1554 GIC 8712708 : indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
1555 8712708 : minoff = P_FIRSTDATAKEY(opaque);
1556 CBC 8712708 : maxoff = PageGetMaxOffsetNumber(page);
1557 ECB :
1558 : /*
1559 : * We note the buffer's block number so that we can release the pin later.
1560 : * This allows us to re-read the buffer if it is needed again for hinting.
1561 : */
1562 GIC 8712708 : so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1563 :
1564 : /*
1565 ECB : * We save the LSN of the page as we read it, so that we know whether it
1566 : * safe to apply LP_DEAD hints to the page later. This allows us to drop
1567 : * the pin for MVCC scans, which allows vacuum to avoid blocking.
1568 : */
1569 GIC 8712708 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1570 :
1571 : /*
1572 ECB : * we must save the page's right-link while scanning it; this tells us
1573 : * where to step right to after we're done with these items. There is no
1574 : * corresponding need for the left-link, since splits always go right.
1575 : */
1576 GIC 8712708 : so->currPos.nextPage = opaque->btpo_next;
1577 :
1578 : /* initialize tuple workspace to empty */
1579 CBC 8712708 : so->currPos.nextTupleOffset = 0;
1580 :
1581 : /*
1582 ECB : * Now that the current page has been made consistent, the macro should be
1583 : * good.
1584 : */
1585 GIC 8712708 : Assert(BTScanPosIsPinned(so->currPos));
1586 :
1587 8712708 : if (ScanDirectionIsForward(dir))
1588 ECB : {
1589 : /* load items[] in ascending order */
1590 CBC 8691103 : itemIndex = 0;
1591 :
1592 GIC 8691103 : offnum = Max(offnum, minoff);
1593 ECB :
1594 GIC 30578310 : while (offnum <= maxoff)
1595 ECB : {
1596 GIC 27999519 : ItemId iid = PageGetItemId(page, offnum);
1597 ECB : IndexTuple itup;
1598 :
1599 : /*
1600 : * If the scan specifies not to return killed tuples, then we
1601 : * treat a killed tuple as not passing the qual
1602 : */
1603 GIC 27999519 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1604 : {
1605 1157894 : offnum = OffsetNumberNext(offnum);
1606 CBC 1157894 : continue;
1607 : }
1608 ECB :
1609 CBC 26841625 : itup = (IndexTuple) PageGetItem(page, iid);
1610 :
1611 GIC 26841625 : if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
1612 ECB : {
1613 : /* tuple passes all scan key conditions */
1614 CBC 20556895 : if (!BTreeTupleIsPosting(itup))
1615 : {
1616 : /* Remember it */
1617 20272487 : _bt_saveitem(so, itemIndex, offnum, itup);
1618 GIC 20272487 : itemIndex++;
1619 : }
1620 ECB : else
1621 : {
1622 : int tupleOffset;
1623 :
1624 : /*
1625 : * Set up state to return posting list, and remember first
1626 : * TID
1627 : */
1628 : tupleOffset =
1629 GIC 284408 : _bt_setuppostingitems(so, itemIndex, offnum,
1630 : BTreeTupleGetPostingN(itup, 0),
1631 : itup);
1632 CBC 284408 : itemIndex++;
1633 : /* Remember additional TIDs */
1634 GIC 1444153 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1635 ECB : {
1636 GIC 1159745 : _bt_savepostingitem(so, itemIndex, offnum,
1637 ECB : BTreeTupleGetPostingN(itup, i),
1638 : tupleOffset);
1639 CBC 1159745 : itemIndex++;
1640 : }
1641 : }
1642 ECB : }
1643 : /* When !continuescan, there can't be any more matches, so stop */
1644 GIC 26841625 : if (!continuescan)
1645 6112312 : break;
1646 :
1647 CBC 20729313 : offnum = OffsetNumberNext(offnum);
1648 ECB : }
1649 :
1650 : /*
1651 : * We don't need to visit page to the right when the high key
1652 : * indicates that no more matches will be found there.
1653 : *
1654 : * Checking the high key like this works out more often than you might
1655 : * think. Leaf page splits pick a split point between the two most
1656 : * dissimilar tuples (this is weighed against the need to evenly share
1657 : * free space). Leaf pages with high key attribute values that can
1658 : * only appear on non-pivot tuples on the right sibling page are
1659 : * common.
1660 : */
1661 GIC 8691103 : if (continuescan && !P_RIGHTMOST(opaque))
1662 : {
1663 62692 : ItemId iid = PageGetItemId(page, P_HIKEY);
1664 CBC 62692 : IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1665 : int truncatt;
1666 ECB :
1667 CBC 62692 : truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
1668 GIC 62692 : _bt_checkkeys(scan, itup, truncatt, dir, &continuescan);
1669 : }
1670 ECB :
1671 CBC 8691103 : if (!continuescan)
1672 GIC 6147219 : so->currPos.moreRight = false;
1673 :
1674 CBC 8691103 : Assert(itemIndex <= MaxTIDsPerBTreePage);
1675 8691103 : so->currPos.firstItem = 0;
1676 GIC 8691103 : so->currPos.lastItem = itemIndex - 1;
1677 CBC 8691103 : so->currPos.itemIndex = 0;
1678 ECB : }
1679 : else
1680 : {
1681 : /* load items[] in descending order */
1682 GIC 21605 : itemIndex = MaxTIDsPerBTreePage;
1683 :
1684 21605 : offnum = Min(offnum, maxoff);
1685 ECB :
1686 GIC 4336927 : while (offnum >= minoff)
1687 ECB : {
1688 GIC 4315364 : ItemId iid = PageGetItemId(page, offnum);
1689 ECB : IndexTuple itup;
1690 : bool tuple_alive;
1691 : bool passes_quals;
1692 :
1693 : /*
1694 : * If the scan specifies not to return killed tuples, then we
1695 : * treat a killed tuple as not passing the qual. Most of the
1696 : * time, it's a win to not bother examining the tuple's index
1697 : * keys, but just skip to the next tuple (previous, actually,
1698 : * since we're scanning backwards). However, if this is the first
1699 : * tuple on the page, we do check the index keys, to prevent
1700 : * uselessly advancing to the page to the left. This is similar
1701 : * to the high key optimization used by forward scans.
1702 : */
1703 GIC 4315364 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1704 : {
1705 490854 : Assert(offnum >= P_FIRSTDATAKEY(opaque));
1706 CBC 490854 : if (offnum > P_FIRSTDATAKEY(opaque))
1707 : {
1708 490675 : offnum = OffsetNumberPrev(offnum);
1709 490675 : continue;
1710 : }
1711 ECB :
1712 CBC 179 : tuple_alive = false;
1713 : }
1714 : else
1715 3824510 : tuple_alive = true;
1716 :
1717 GIC 3824689 : itup = (IndexTuple) PageGetItem(page, iid);
1718 ECB :
1719 GIC 3824689 : passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
1720 ECB : &continuescan);
1721 GIC 3824689 : if (passes_quals && tuple_alive)
1722 ECB : {
1723 : /* tuple passes all scan key conditions */
1724 CBC 3824468 : if (!BTreeTupleIsPosting(itup))
1725 : {
1726 : /* Remember it */
1727 3798388 : itemIndex--;
1728 GIC 3798388 : _bt_saveitem(so, itemIndex, offnum, itup);
1729 : }
1730 ECB : else
1731 : {
1732 : int tupleOffset;
1733 :
1734 : /*
1735 : * Set up state to return posting list, and remember first
1736 : * TID.
1737 : *
1738 : * Note that we deliberately save/return items from
1739 : * posting lists in ascending heap TID order for backwards
1740 : * scans. This allows _bt_killitems() to make a
1741 : * consistent assumption about the order of items
1742 : * associated with the same posting list tuple.
1743 : */
1744 GIC 26080 : itemIndex--;
1745 : tupleOffset =
1746 26080 : _bt_setuppostingitems(so, itemIndex, offnum,
1747 ECB : BTreeTupleGetPostingN(itup, 0),
1748 : itup);
1749 : /* Remember additional TIDs */
1750 GIC 76589 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1751 : {
1752 50509 : itemIndex--;
1753 CBC 50509 : _bt_savepostingitem(so, itemIndex, offnum,
1754 : BTreeTupleGetPostingN(itup, i),
1755 ECB : tupleOffset);
1756 : }
1757 : }
1758 : }
1759 GIC 3824689 : if (!continuescan)
1760 : {
1761 : /* there can't be any more matches, so stop */
1762 CBC 42 : so->currPos.moreLeft = false;
1763 GIC 42 : break;
1764 : }
1765 ECB :
1766 CBC 3824647 : offnum = OffsetNumberPrev(offnum);
1767 : }
1768 :
1769 21605 : Assert(itemIndex >= 0);
1770 GIC 21605 : so->currPos.firstItem = itemIndex;
1771 21605 : so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1772 CBC 21605 : so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
1773 ECB : }
1774 :
1775 CBC 8712708 : return (so->currPos.firstItem <= so->currPos.lastItem);
1776 : }
1777 :
1778 ECB : /* Save an index item into so->currPos.items[itemIndex] */
1779 : static void
1780 GIC 24070875 : _bt_saveitem(BTScanOpaque so, int itemIndex,
1781 : OffsetNumber offnum, IndexTuple itup)
1782 : {
1783 CBC 24070875 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1784 :
1785 GIC 24070875 : Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1786 ECB :
1787 GIC 24070875 : currItem->heapTid = itup->t_tid;
1788 CBC 24070875 : currItem->indexOffset = offnum;
1789 GIC 24070875 : if (so->currTuples)
1790 ECB : {
1791 CBC 9999342 : Size itupsz = IndexTupleSize(itup);
1792 ECB :
1793 GIC 9999342 : currItem->tupleOffset = so->currPos.nextTupleOffset;
1794 CBC 9999342 : memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1795 GIC 9999342 : so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1796 ECB : }
1797 CBC 24070875 : }
1798 ECB :
1799 : /*
1800 : * Setup state to save TIDs/items from a single posting list tuple.
1801 : *
1802 : * Saves an index item into so->currPos.items[itemIndex] for TID that is
1803 : * returned to scan first. Second or subsequent TIDs for posting list should
1804 : * be saved by calling _bt_savepostingitem().
1805 : *
1806 : * Returns an offset into tuple storage space that main tuple is stored at if
1807 : * needed.
1808 : */
1809 : static int
1810 GIC 310488 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1811 : ItemPointer heapTid, IndexTuple itup)
1812 : {
1813 CBC 310488 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1814 :
1815 GIC 310488 : Assert(BTreeTupleIsPosting(itup));
1816 ECB :
1817 GIC 310488 : currItem->heapTid = *heapTid;
1818 CBC 310488 : currItem->indexOffset = offnum;
1819 GIC 310488 : if (so->currTuples)
1820 ECB : {
1821 : /* Save base IndexTuple (truncate posting list) */
1822 : IndexTuple base;
1823 GIC 133414 : Size itupsz = BTreeTupleGetPostingOffset(itup);
1824 :
1825 133414 : itupsz = MAXALIGN(itupsz);
1826 CBC 133414 : currItem->tupleOffset = so->currPos.nextTupleOffset;
1827 GIC 133414 : base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1828 CBC 133414 : memcpy(base, itup, itupsz);
1829 ECB : /* Defensively reduce work area index tuple header size */
1830 CBC 133414 : base->t_info &= ~INDEX_SIZE_MASK;
1831 133414 : base->t_info |= itupsz;
1832 GIC 133414 : so->currPos.nextTupleOffset += itupsz;
1833 ECB :
1834 CBC 133414 : return currItem->tupleOffset;
1835 ECB : }
1836 :
1837 CBC 177074 : return 0;
1838 : }
1839 :
1840 ECB : /*
1841 : * Save an index item into so->currPos.items[itemIndex] for current posting
1842 : * tuple.
1843 : *
1844 : * Assumes that _bt_setuppostingitems() has already been called for current
1845 : * posting list tuple. Caller passes its return value as tupleOffset.
1846 : */
1847 : static inline void
1848 GIC 1210254 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1849 : ItemPointer heapTid, int tupleOffset)
1850 : {
1851 CBC 1210254 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1852 :
1853 GIC 1210254 : currItem->heapTid = *heapTid;
1854 CBC 1210254 : currItem->indexOffset = offnum;
1855 :
1856 ECB : /*
1857 : * Have index-only scans return the same base IndexTuple for every TID
1858 : * that originates from the same posting list
1859 : */
1860 GIC 1210254 : if (so->currTuples)
1861 465558 : currItem->tupleOffset = tupleOffset;
1862 1210254 : }
1863 ECB :
1864 : /*
1865 : * _bt_steppage() -- Step to next page containing valid data for scan
1866 : *
1867 : * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
1868 : * if pinned, we'll drop the pin before moving to next page. The buffer is
1869 : * not locked on entry.
1870 : *
1871 : * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
1872 : * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
1873 : * to InvalidBuffer. We return true to indicate success.
1874 : */
1875 : static bool
1876 GIC 4851051 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
1877 : {
1878 4851051 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1879 CBC 4851051 : BlockNumber blkno = InvalidBlockNumber;
1880 : bool status;
1881 ECB :
1882 CBC 4851051 : Assert(BTScanPosIsValid(so->currPos));
1883 :
1884 : /* Before leaving current page, deal with any killed items */
1885 4851051 : if (so->numKilled > 0)
1886 GIC 39535 : _bt_killitems(scan);
1887 :
1888 ECB : /*
1889 : * Before we modify currPos, make a copy of the page data if there was a
1890 : * mark position that needs it.
1891 : */
1892 GIC 4851051 : if (so->markItemIndex >= 0)
1893 : {
1894 : /* bump pin on current buffer for assignment to mark buffer */
1895 CBC 183 : if (BTScanPosIsPinned(so->currPos))
1896 GIC 175 : IncrBufferRefCount(so->currPos.buf);
1897 183 : memcpy(&so->markPos, &so->currPos,
1898 ECB : offsetof(BTScanPosData, items[1]) +
1899 CBC 183 : so->currPos.lastItem * sizeof(BTScanPosItem));
1900 183 : if (so->markTuples)
1901 GIC 174 : memcpy(so->markTuples, so->currTuples,
1902 CBC 174 : so->currPos.nextTupleOffset);
1903 183 : so->markPos.itemIndex = so->markItemIndex;
1904 183 : so->markItemIndex = -1;
1905 ECB : }
1906 :
1907 CBC 4851051 : if (ScanDirectionIsForward(dir))
1908 : {
1909 : /* Walk right to the next page with data */
1910 4851005 : if (scan->parallel_scan != NULL)
1911 : {
1912 : /*
1913 ECB : * Seize the scan to get the next block number; if the scan has
1914 : * ended already, bail out.
1915 : */
1916 GIC 644 : status = _bt_parallel_seize(scan, &blkno);
1917 644 : if (!status)
1918 : {
1919 ECB : /* release the previous buffer, if pinned */
1920 CBC 6 : BTScanPosUnpinIfPinned(so->currPos);
1921 GIC 6 : BTScanPosInvalidate(so->currPos);
1922 6 : return false;
1923 ECB : }
1924 : }
1925 : else
1926 : {
1927 : /* Not parallel, so use the previously-saved nextPage link. */
1928 GIC 4850361 : blkno = so->currPos.nextPage;
1929 : }
1930 :
1931 ECB : /* Remember we left a page with data */
1932 GIC 4850999 : so->currPos.moreLeft = true;
1933 :
1934 : /* release the previous buffer, if pinned */
1935 CBC 4850999 : BTScanPosUnpinIfPinned(so->currPos);
1936 : }
1937 : else
1938 ECB : {
1939 : /* Remember we left a page with data */
1940 GIC 46 : so->currPos.moreRight = true;
1941 :
1942 46 : if (scan->parallel_scan != NULL)
1943 ECB : {
1944 : /*
1945 : * Seize the scan to get the current block number; if the scan has
1946 : * ended already, bail out.
1947 : */
1948 UIC 0 : status = _bt_parallel_seize(scan, &blkno);
1949 0 : BTScanPosUnpinIfPinned(so->currPos);
1950 0 : if (!status)
1951 EUB : {
1952 UBC 0 : BTScanPosInvalidate(so->currPos);
1953 0 : return false;
1954 : }
1955 EUB : }
1956 : else
1957 : {
1958 : /* Not parallel, so just use our own notion of the current page */
1959 GIC 46 : blkno = so->currPos.currPage;
1960 : }
1961 : }
1962 ECB :
1963 GIC 4851045 : if (!_bt_readnextpage(scan, blkno, dir))
1964 4836686 : return false;
1965 :
1966 ECB : /* Drop the lock, and maybe the pin, on the current page */
1967 CBC 14359 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1968 :
1969 GIC 14359 : return true;
1970 ECB : }
1971 :
1972 : /*
1973 : * _bt_readnextpage() -- Read next page containing valid data for scan
1974 : *
1975 : * On success exit, so->currPos is updated to contain data from the next
1976 : * interesting page. Caller is responsible to release lock and pin on
1977 : * buffer on success. We return true to indicate success.
1978 : *
1979 : * If there are no more matching records in the given direction, we drop all
1980 : * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
1981 : */
1982 : static bool
1983 GIC 4851049 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
1984 : {
1985 4851049 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1986 ECB : Relation rel;
1987 : Page page;
1988 : BTPageOpaque opaque;
1989 : bool status;
1990 :
1991 GIC 4851049 : rel = scan->indexRelation;
1992 :
1993 4851049 : if (ScanDirectionIsForward(dir))
1994 ECB : {
1995 : for (;;)
1996 : {
1997 : /*
1998 : * if we're at end of scan, give up and mark parallel scan as
1999 : * done, so that all the workers can finish their scan
2000 : */
2001 GIC 4851755 : if (blkno == P_NONE || !so->currPos.moreRight)
2002 : {
2003 4836652 : _bt_parallel_done(scan);
2004 CBC 4836652 : BTScanPosInvalidate(so->currPos);
2005 GIC 4836652 : return false;
2006 ECB : }
2007 : /* check for interrupts while we're not holding any buffer lock */
2008 CBC 15103 : CHECK_FOR_INTERRUPTS();
2009 : /* step right one page */
2010 GNC 15103 : so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno, BT_READ);
2011 CBC 15103 : page = BufferGetPage(so->currPos.buf);
2012 GIC 15103 : TestForOldSnapshot(scan->xs_snapshot, rel, page);
2013 CBC 15103 : opaque = BTPageGetOpaque(page);
2014 ECB : /* check for deleted page */
2015 CBC 15103 : if (!P_IGNORE(opaque))
2016 ECB : {
2017 GIC 15103 : PredicateLockPage(rel, blkno, scan->xs_snapshot);
2018 ECB : /* see if there are any matches on this page */
2019 : /* note that this will clear moreRight if we can stop */
2020 CBC 15103 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
2021 GIC 14351 : break;
2022 : }
2023 LBC 0 : else if (scan->parallel_scan != NULL)
2024 ECB : {
2025 : /* allow next page be processed by parallel worker */
2026 UBC 0 : _bt_parallel_release(scan, opaque->btpo_next);
2027 : }
2028 :
2029 EUB : /* nope, keep going */
2030 GIC 752 : if (scan->parallel_scan != NULL)
2031 : {
2032 UIC 0 : _bt_relbuf(rel, so->currPos.buf);
2033 LBC 0 : status = _bt_parallel_seize(scan, &blkno);
2034 UIC 0 : if (!status)
2035 EUB : {
2036 UBC 0 : BTScanPosInvalidate(so->currPos);
2037 0 : return false;
2038 : }
2039 EUB : }
2040 : else
2041 : {
2042 GIC 752 : blkno = opaque->btpo_next;
2043 752 : _bt_relbuf(rel, so->currPos.buf);
2044 : }
2045 ECB : }
2046 : }
2047 : else
2048 : {
2049 : /*
2050 : * Should only happen in parallel cases, when some other backend
2051 : * advanced the scan.
2052 : */
2053 GIC 46 : if (so->currPos.currPage != blkno)
2054 : {
2055 UIC 0 : BTScanPosUnpinIfPinned(so->currPos);
2056 LBC 0 : so->currPos.currPage = blkno;
2057 : }
2058 EUB :
2059 : /*
2060 : * Walk left to the next page with data. This is much more complex
2061 : * than the walk-right case because of the possibility that the page
2062 : * to our left splits while we are in flight to it, plus the
2063 : * possibility that the page we were on gets deleted after we leave
2064 : * it. See nbtree/README for details.
2065 : *
2066 : * It might be possible to rearrange this code to have less overhead
2067 : * in pinning and locking, but that would require capturing the left
2068 : * pointer when the page is initially read, and using it here, along
2069 : * with big changes to _bt_walk_left() and the code below. It is not
2070 : * clear whether this would be a win, since if the page immediately to
2071 : * the left splits after we read this page and before we step left, we
2072 : * would need to visit more pages than with the current code.
2073 : *
2074 : * Note that if we change the code so that we drop the pin for a scan
2075 : * which uses a non-MVCC snapshot, we will need to modify the code for
2076 : * walking left, to allow for the possibility that a referenced page
2077 : * has been deleted. As long as the buffer is pinned or the snapshot
2078 : * is MVCC the page cannot move past the half-dead state to fully
2079 : * deleted.
2080 : */
2081 GIC 46 : if (BTScanPosIsPinned(so->currPos))
2082 24 : _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2083 : else
2084 GNC 22 : so->currPos.buf = _bt_getbuf(rel, scan->heapRelation,
2085 : so->currPos.currPage, BT_READ);
2086 ECB :
2087 : for (;;)
2088 : {
2089 : /* Done if we know there are no matching keys to the left */
2090 GIC 48 : if (!so->currPos.moreLeft)
2091 : {
2092 21 : _bt_relbuf(rel, so->currPos.buf);
2093 21 : _bt_parallel_done(scan);
2094 CBC 21 : BTScanPosInvalidate(so->currPos);
2095 GIC 21 : return false;
2096 ECB : }
2097 :
2098 : /* Step to next physical page */
2099 GNC 27 : so->currPos.buf = _bt_walk_left(rel, scan->heapRelation,
2100 : so->currPos.buf, scan->xs_snapshot);
2101 :
2102 : /* if we're physically at end of index, return failure */
2103 CBC 27 : if (so->currPos.buf == InvalidBuffer)
2104 : {
2105 GIC 13 : _bt_parallel_done(scan);
2106 13 : BTScanPosInvalidate(so->currPos);
2107 CBC 13 : return false;
2108 : }
2109 ECB :
2110 : /*
2111 : * Okay, we managed to move left to a non-deleted page. Done if
2112 : * it's not half-dead and contains matching tuples. Else loop back
2113 : * and do it all again.
2114 : */
2115 GIC 14 : page = BufferGetPage(so->currPos.buf);
2116 14 : TestForOldSnapshot(scan->xs_snapshot, rel, page);
2117 14 : opaque = BTPageGetOpaque(page);
2118 14 : if (!P_IGNORE(opaque))
2119 ECB : {
2120 CBC 14 : PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
2121 ECB : /* see if there are any matches on this page */
2122 : /* note that this will clear moreLeft if we can stop */
2123 GIC 14 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
2124 CBC 12 : break;
2125 : }
2126 UIC 0 : else if (scan->parallel_scan != NULL)
2127 ECB : {
2128 : /* allow next page be processed by parallel worker */
2129 UIC 0 : _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
2130 EUB : }
2131 :
2132 : /*
2133 : * For parallel scans, get the last page scanned as it is quite
2134 : * possible that by the time we try to seize the scan, some other
2135 : * worker has already advanced the scan to a different page. We
2136 : * must continue based on the latest page scanned by any worker.
2137 : */
2138 GIC 2 : if (scan->parallel_scan != NULL)
2139 : {
2140 UIC 0 : _bt_relbuf(rel, so->currPos.buf);
2141 0 : status = _bt_parallel_seize(scan, &blkno);
2142 LBC 0 : if (!status)
2143 : {
2144 UBC 0 : BTScanPosInvalidate(so->currPos);
2145 0 : return false;
2146 EUB : }
2147 UNC 0 : so->currPos.buf = _bt_getbuf(rel, scan->heapRelation, blkno,
2148 : BT_READ);
2149 EUB : }
2150 : }
2151 : }
2152 :
2153 GIC 14363 : return true;
2154 : }
2155 :
2156 : /*
2157 : * _bt_parallel_readpage() -- Read current page containing valid data for scan
2158 ECB : *
2159 : * On success, release lock and maybe pin on buffer. We return true to
2160 : * indicate success.
2161 : */
2162 : static bool
2163 GIC 4 : _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2164 : {
2165 4 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2166 :
2167 4 : _bt_initialize_more_data(so, dir);
2168 ECB :
2169 GIC 4 : if (!_bt_readnextpage(scan, blkno, dir))
2170 LBC 0 : return false;
2171 :
2172 ECB : /* Drop the lock, and maybe the pin, on the current page */
2173 GIC 4 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2174 ECB :
2175 GBC 4 : return true;
2176 : }
2177 :
2178 ECB : /*
2179 : * _bt_walk_left() -- step left one page, if possible
2180 : *
2181 : * The given buffer must be pinned and read-locked. This will be dropped
2182 : * before stepping left. On return, we have pin and read lock on the
2183 : * returned page, instead.
2184 : *
2185 : * Returns InvalidBuffer if there is no page to the left (no lock is held
2186 : * in that case).
2187 : *
2188 : * When working on a non-leaf level, it is possible for the returned page
2189 : * to be half-dead; the caller should check that condition and step left
2190 : * again if it's important.
2191 : */
2192 : static Buffer
2193 GNC 27 : _bt_walk_left(Relation rel, Relation heaprel, Buffer buf, Snapshot snapshot)
2194 : {
2195 : Page page;
2196 : BTPageOpaque opaque;
2197 :
2198 CBC 27 : page = BufferGetPage(buf);
2199 GIC 27 : opaque = BTPageGetOpaque(page);
2200 :
2201 : for (;;)
2202 UIC 0 : {
2203 ECB : BlockNumber obknum;
2204 : BlockNumber lblkno;
2205 : BlockNumber blkno;
2206 : int tries;
2207 EUB :
2208 : /* if we're at end of tree, release buf and return failure */
2209 GIC 27 : if (P_LEFTMOST(opaque))
2210 : {
2211 13 : _bt_relbuf(rel, buf);
2212 13 : break;
2213 : }
2214 ECB : /* remember original page we are stepping left from */
2215 GIC 14 : obknum = BufferGetBlockNumber(buf);
2216 ECB : /* step left */
2217 CBC 14 : blkno = lblkno = opaque->btpo_prev;
2218 GIC 14 : _bt_relbuf(rel, buf);
2219 : /* check for interrupts while we're not holding any buffer lock */
2220 CBC 14 : CHECK_FOR_INTERRUPTS();
2221 GNC 14 : buf = _bt_getbuf(rel, heaprel, blkno, BT_READ);
2222 CBC 14 : page = BufferGetPage(buf);
2223 14 : TestForOldSnapshot(snapshot, rel, page);
2224 GIC 14 : opaque = BTPageGetOpaque(page);
2225 ECB :
2226 : /*
2227 : * If this isn't the page we want, walk right till we find what we
2228 : * want --- but go no more than four hops (an arbitrary limit). If we
2229 : * don't find the correct page by then, the most likely bet is that
2230 : * the original page got deleted and isn't in the sibling chain at all
2231 : * anymore, not that its left sibling got split more than four times.
2232 : *
2233 : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2234 : * because half-dead pages are still in the sibling chain. Caller
2235 : * must reject half-dead pages if wanted.
2236 : */
2237 GIC 14 : tries = 0;
2238 : for (;;)
2239 : {
2240 14 : if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2241 : {
2242 ECB : /* Found desired page, return it */
2243 GIC 14 : return buf;
2244 : }
2245 LBC 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
2246 : break;
2247 UIC 0 : blkno = opaque->btpo_next;
2248 LBC 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2249 UIC 0 : page = BufferGetPage(buf);
2250 UBC 0 : TestForOldSnapshot(snapshot, rel, page);
2251 UIC 0 : opaque = BTPageGetOpaque(page);
2252 EUB : }
2253 :
2254 : /* Return to the original page to see what's up */
2255 UBC 0 : buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2256 0 : page = BufferGetPage(buf);
2257 UIC 0 : TestForOldSnapshot(snapshot, rel, page);
2258 0 : opaque = BTPageGetOpaque(page);
2259 0 : if (P_ISDELETED(opaque))
2260 EUB : {
2261 : /*
2262 : * It was deleted. Move right to first nondeleted page (there
2263 : * must be one); that is the page that has acquired the deleted
2264 : * one's keyspace, so stepping left from it will take us where we
2265 : * want to be.
2266 : */
2267 : for (;;)
2268 : {
2269 UIC 0 : if (P_RIGHTMOST(opaque))
2270 0 : elog(ERROR, "fell off the end of index \"%s\"",
2271 : RelationGetRelationName(rel));
2272 0 : blkno = opaque->btpo_next;
2273 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2274 UBC 0 : page = BufferGetPage(buf);
2275 0 : TestForOldSnapshot(snapshot, rel, page);
2276 UIC 0 : opaque = BTPageGetOpaque(page);
2277 UBC 0 : if (!P_ISDELETED(opaque))
2278 0 : break;
2279 EUB : }
2280 :
2281 : /*
2282 : * Now return to top of loop, resetting obknum to point to this
2283 : * nondeleted page, and try again.
2284 : */
2285 : }
2286 : else
2287 : {
2288 : /*
2289 : * It wasn't deleted; the explanation had better be that the page
2290 : * to the left got split or deleted. Without this check, we'd go
2291 : * into an infinite loop if there's anything wrong.
2292 : */
2293 UIC 0 : if (opaque->btpo_prev == lblkno)
2294 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2295 : obknum, RelationGetRelationName(rel));
2296 : /* Okay to try again with new lblkno value */
2297 : }
2298 EUB : }
2299 :
2300 GIC 13 : return InvalidBuffer;
2301 : }
2302 :
2303 : /*
2304 : * _bt_get_endpoint() -- Find the first or last page on a given tree level
2305 ECB : *
2306 : * If the index is empty, we will return InvalidBuffer; any other failure
2307 : * condition causes ereport(). We will not return a dead page.
2308 : *
2309 : * The returned buffer is pinned and read-locked.
2310 : */
2311 : Buffer
2312 GNC 31969 : _bt_get_endpoint(Relation rel, Relation heaprel, uint32 level, bool rightmost,
2313 : Snapshot snapshot)
2314 : {
2315 : Buffer buf;
2316 : Page page;
2317 ECB : BTPageOpaque opaque;
2318 : OffsetNumber offnum;
2319 : BlockNumber blkno;
2320 : IndexTuple itup;
2321 :
2322 : /*
2323 : * If we are looking for a leaf page, okay to descend from fast root;
2324 : * otherwise better descend from true root. (There is no point in being
2325 : * smarter about intermediate levels.)
2326 : */
2327 GIC 31969 : if (level == 0)
2328 GNC 31957 : buf = _bt_getroot(rel, heaprel, BT_READ);
2329 : else
2330 12 : buf = _bt_gettrueroot(rel, heaprel);
2331 :
2332 CBC 31969 : if (!BufferIsValid(buf))
2333 3197 : return InvalidBuffer;
2334 :
2335 28772 : page = BufferGetPage(buf);
2336 GIC 28772 : TestForOldSnapshot(snapshot, rel, page);
2337 CBC 28772 : opaque = BTPageGetOpaque(page);
2338 ECB :
2339 : for (;;)
2340 : {
2341 : /*
2342 : * If we landed on a deleted page, step right to find a live page
2343 : * (there must be one). Also, if we want the rightmost page, step
2344 : * right if needed to get to it (this could happen if the page split
2345 : * since we obtained a pointer to it).
2346 : */
2347 GIC 42145 : while (P_IGNORE(opaque) ||
2348 27 : (rightmost && !P_RIGHTMOST(opaque)))
2349 : {
2350 UIC 0 : blkno = opaque->btpo_next;
2351 0 : if (blkno == P_NONE)
2352 LBC 0 : elog(ERROR, "fell off the end of index \"%s\"",
2353 ECB : RelationGetRelationName(rel));
2354 UIC 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2355 UBC 0 : page = BufferGetPage(buf);
2356 0 : TestForOldSnapshot(snapshot, rel, page);
2357 0 : opaque = BTPageGetOpaque(page);
2358 : }
2359 EUB :
2360 : /* Done? */
2361 GBC 42145 : if (opaque->btpo_level == level)
2362 28772 : break;
2363 GIC 13373 : if (opaque->btpo_level < level)
2364 UIC 0 : ereport(ERROR,
2365 : (errcode(ERRCODE_INDEX_CORRUPTED),
2366 ECB : errmsg_internal("btree level %u not found in index \"%s\"",
2367 : level, RelationGetRelationName(rel))));
2368 :
2369 EUB : /* Descend to leftmost or rightmost child page */
2370 GIC 13373 : if (rightmost)
2371 3 : offnum = PageGetMaxOffsetNumber(page);
2372 : else
2373 13370 : offnum = P_FIRSTDATAKEY(opaque);
2374 :
2375 CBC 13373 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
2376 13373 : blkno = BTreeTupleGetDownLink(itup);
2377 :
2378 13373 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2379 GIC 13373 : page = BufferGetPage(buf);
2380 CBC 13373 : opaque = BTPageGetOpaque(page);
2381 ECB : }
2382 :
2383 CBC 28772 : return buf;
2384 ECB : }
2385 :
2386 : /*
2387 : * _bt_endpoint() -- Find the first or last page in the index, and scan
2388 : * from there to the first key satisfying all the quals.
2389 : *
2390 : * This is used by _bt_first() to set up a scan when we've determined
2391 : * that the scan must start at the beginning or end of the index (for
2392 : * a forward or backward scan respectively). Exit conditions are the
2393 : * same as for _bt_first().
2394 : */
2395 : static bool
2396 GIC 31957 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2397 : {
2398 31957 : Relation rel = scan->indexRelation;
2399 31957 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2400 : Buffer buf;
2401 ECB : Page page;
2402 : BTPageOpaque opaque;
2403 : OffsetNumber start;
2404 : BTScanPosItem *currItem;
2405 :
2406 : /*
2407 : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2408 : * version of _bt_search(). We don't maintain a stack since we know we
2409 : * won't need it.
2410 : */
2411 GNC 31957 : buf = _bt_get_endpoint(rel, scan->heapRelation, 0,
2412 : ScanDirectionIsBackward(dir), scan->xs_snapshot);
2413 :
2414 GIC 31957 : if (!BufferIsValid(buf))
2415 : {
2416 : /*
2417 ECB : * Empty index. Lock the whole relation, as nothing finer to lock
2418 : * exists.
2419 : */
2420 CBC 3197 : PredicateLockRelation(rel, scan->xs_snapshot);
2421 GIC 3197 : BTScanPosInvalidate(so->currPos);
2422 3197 : return false;
2423 : }
2424 :
2425 28760 : PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2426 CBC 28760 : page = BufferGetPage(buf);
2427 28760 : opaque = BTPageGetOpaque(page);
2428 28760 : Assert(P_ISLEAF(opaque));
2429 :
2430 GIC 28760 : if (ScanDirectionIsForward(dir))
2431 ECB : {
2432 : /* There could be dead pages to the left, so not this: */
2433 : /* Assert(P_LEFTMOST(opaque)); */
2434 :
2435 GIC 28736 : start = P_FIRSTDATAKEY(opaque);
2436 ECB : }
2437 GIC 24 : else if (ScanDirectionIsBackward(dir))
2438 : {
2439 24 : Assert(P_RIGHTMOST(opaque));
2440 :
2441 CBC 24 : start = PageGetMaxOffsetNumber(page);
2442 : }
2443 ECB : else
2444 : {
2445 LBC 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2446 : start = 0; /* keep compiler quiet */
2447 ECB : }
2448 :
2449 : /* remember which buffer we have pinned */
2450 GIC 28760 : so->currPos.buf = buf;
2451 EUB :
2452 GIC 28760 : _bt_initialize_more_data(so, dir);
2453 :
2454 : /*
2455 : * Now load data from the first page of the scan.
2456 ECB : */
2457 GIC 28760 : if (!_bt_readpage(scan, dir, start))
2458 ECB : {
2459 : /*
2460 : * There's no actually-matching data on this page. Try to advance to
2461 : * the next page. Return false if there's no matching data at all.
2462 : */
2463 CBC 820 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2464 GIC 820 : if (!_bt_steppage(scan, dir))
2465 781 : return false;
2466 : }
2467 : else
2468 : {
2469 ECB : /* Drop the lock, and maybe the pin, on the current page */
2470 CBC 27940 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2471 ECB : }
2472 :
2473 : /* OK, itemIndex says what to return */
2474 GIC 27979 : currItem = &so->currPos.items[so->currPos.itemIndex];
2475 27979 : scan->xs_heaptid = currItem->heapTid;
2476 CBC 27979 : if (scan->xs_want_itup)
2477 GIC 25940 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2478 :
2479 27979 : return true;
2480 ECB : }
2481 :
2482 : /*
2483 : * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
2484 : * for scan direction
2485 : */
2486 : static inline void
2487 GIC 8697595 : _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2488 : {
2489 : /* initialize moreLeft/moreRight appropriately for scan direction */
2490 8697595 : if (ScanDirectionIsForward(dir))
2491 : {
2492 8676004 : so->currPos.moreLeft = false;
2493 CBC 8676004 : so->currPos.moreRight = true;
2494 : }
2495 : else
2496 ECB : {
2497 GIC 21591 : so->currPos.moreLeft = true;
2498 CBC 21591 : so->currPos.moreRight = false;
2499 ECB : }
2500 GIC 8697595 : so->numKilled = 0; /* just paranoia */
2501 8697595 : so->markItemIndex = -1; /* ditto */
2502 8697595 : }
|