Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtsearch.c
4 : : * Search code for postgres btrees.
5 : : *
6 : : *
7 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/nbtree/nbtsearch.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : :
16 : : #include "postgres.h"
17 : :
18 : : #include "access/nbtree.h"
19 : : #include "access/relscan.h"
20 : : #include "access/xact.h"
21 : : #include "miscadmin.h"
22 : : #include "pgstat.h"
23 : : #include "storage/predicate.h"
24 : : #include "utils/lsyscache.h"
25 : : #include "utils/rel.h"
26 : :
27 : :
28 : : static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
29 : : static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf);
30 : : static int _bt_binsrch_posting(BTScanInsert key, Page page,
31 : : OffsetNumber offnum);
32 : : static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
33 : : OffsetNumber offnum, bool firstPage);
34 : : static void _bt_saveitem(BTScanOpaque so, int itemIndex,
35 : : OffsetNumber offnum, IndexTuple itup);
36 : : static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex,
37 : : OffsetNumber offnum, ItemPointer heapTid,
38 : : IndexTuple itup);
39 : : static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
40 : : OffsetNumber offnum,
41 : : ItemPointer heapTid, int tupleOffset);
42 : : static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
43 : : static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
44 : : static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
45 : : ScanDirection dir);
46 : : static Buffer _bt_walk_left(Relation rel, Buffer buf);
47 : : static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
48 : : static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
49 : :
50 : :
51 : : /*
52 : : * _bt_drop_lock_and_maybe_pin()
53 : : *
54 : : * Unlock the buffer; and if it is safe to release the pin, do that, too.
55 : : * This will prevent vacuum from stalling in a blocked state trying to read a
56 : : * page when a cursor is sitting on it.
57 : : *
58 : : * See nbtree/README section on making concurrent TID recycling safe.
59 : : */
60 : : static void
3308 kgrittn@postgresql.o 61 :CBC 4524757 : _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
62 : : {
1363 pg@bowt.ie 63 : 4524757 : _bt_unlockbuf(scan->indexRelation, sp->buf);
64 : :
3308 kgrittn@postgresql.o 65 [ + + + + ]: 4524757 : if (IsMVCCSnapshot(scan->xs_snapshot) &&
66 [ + + + + + + + + ]: 4372781 : if (IsMVCCSnapshot(scan->xs_snapshot) && RelationNeedsWAL(scan->indexRelation) &&
67 [ + + ]: 4370178 : !scan->xs_want_itup)
68 : : {
69 : 4321242 : ReleaseBuffer(sp->buf);
70 : 4321242 : sp->buf = InvalidBuffer;
71 : : }
72 : 4524757 : }
73 : :
74 : : /*
75 : : * _bt_search() -- Search the tree for a particular scankey,
76 : : * or more precisely for the first leaf page it could be on.
77 : : *
78 : : * The passed scankey is an insertion-type scankey (see nbtree/README),
79 : : * but it can omit the rightmost column(s) of the index.
80 : : *
81 : : * Return value is a stack of parent-page pointers (i.e. there is no entry for
82 : : * the leaf level/page). *bufP is set to the address of the leaf-page buffer,
83 : : * which is locked and pinned. No locks are held on the parent pages,
84 : : * however!
85 : : *
86 : : * The returned buffer is locked according to access parameter. Additionally,
87 : : * access = BT_WRITE will allow an empty root page to be created and returned.
88 : : * When access = BT_READ, an empty index will result in *bufP being set to
89 : : * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered
90 : : * during the search will be finished.
91 : : *
92 : : * heaprel must be provided by callers that pass access = BT_WRITE, since we
93 : : * might need to allocate a new root page for caller -- see _bt_allocbuf.
94 : : */
95 : : BTStack
379 andres@anarazel.de 96 : 9915476 : _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP,
97 : : int access)
98 : : {
8424 bruce@momjian.us 99 : 9915476 : BTStack stack_in = NULL;
2087 akorotkov@postgresql 100 : 9915476 : int page_access = BT_READ;
101 : :
102 : : /* heaprel must be set whenever _bt_allocbuf is reachable */
309 pg@bowt.ie 103 [ + + - + ]: 9915476 : Assert(access == BT_READ || access == BT_WRITE);
104 [ + + - + ]: 9915476 : Assert(access == BT_READ || heaprel != NULL);
105 : :
106 : : /* Get the root page to start with */
379 andres@anarazel.de 107 : 9915476 : *bufP = _bt_getroot(rel, heaprel, access);
108 : :
109 : : /* If index is empty and access = BT_READ, no root page is created. */
8424 bruce@momjian.us 110 [ + + ]: 9915475 : if (!BufferIsValid(*bufP))
8668 tgl@sss.pgh.pa.us 111 : 244779 : return (BTStack) NULL;
112 : :
113 : : /* Loop iterates once per level descended in the tree */
114 : : for (;;)
115 : 8015362 : {
116 : : Page page;
117 : : BTPageOpaque opaque;
118 : : OffsetNumber offnum;
119 : : ItemId itemid;
120 : : IndexTuple itup;
121 : : BlockNumber child;
122 : : BTStack new_stack;
123 : :
124 : : /*
125 : : * Race -- the page we just grabbed may have split since we read its
126 : : * downlink in its parent page (or the metapage). If it has, we may
127 : : * need to move right to its new sibling. Do that.
128 : : *
129 : : * In write-mode, allow _bt_moveright to finish any incomplete splits
130 : : * along the way. Strictly speaking, we'd only need to finish an
131 : : * incomplete split on the leaf page we're about to insert to, not on
132 : : * any of the upper levels (internal pages with incomplete splits are
133 : : * also taken care of in _bt_getstackbuf). But this is a good
134 : : * opportunity to finish splits of internal pages too.
135 : : */
379 andres@anarazel.de 136 : 17686058 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE),
137 : : stack_in, page_access);
138 : :
139 : : /* if this is a leaf page, we're done */
2916 kgrittn@postgresql.o 140 : 17686058 : page = BufferGetPage(*bufP);
744 michael@paquier.xyz 141 : 17686058 : opaque = BTPageGetOpaque(page);
8668 tgl@sss.pgh.pa.us 142 [ + + ]: 17686058 : if (P_ISLEAF(opaque))
143 : 9670696 : break;
144 : :
145 : : /*
146 : : * Find the appropriate pivot tuple on this page. Its downlink points
147 : : * to the child page that we're about to descend to.
148 : : */
1852 pg@bowt.ie 149 : 8015362 : offnum = _bt_binsrch(rel, key, *bufP);
8668 tgl@sss.pgh.pa.us 150 : 8015362 : itemid = PageGetItemId(page, offnum);
6654 151 : 8015362 : itup = (IndexTuple) PageGetItem(page, itemid);
1509 pg@bowt.ie 152 [ - + - - ]: 8015362 : Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace);
1382 153 : 8015362 : child = BTreeTupleGetDownLink(itup);
154 : :
155 : : /*
156 : : * We need to save the location of the pivot tuple we chose in a new
157 : : * stack entry for this page/level. If caller ends up splitting a
158 : : * page one level down, it usually ends up inserting a new pivot
159 : : * tuple/downlink immediately after the location recorded here.
160 : : */
8668 tgl@sss.pgh.pa.us 161 : 8015362 : new_stack = (BTStack) palloc(sizeof(BTStackData));
1382 pg@bowt.ie 162 : 8015362 : new_stack->bts_blkno = BufferGetBlockNumber(*bufP);
8668 tgl@sss.pgh.pa.us 163 : 8015362 : new_stack->bts_offset = offnum;
164 : 8015362 : new_stack->bts_parent = stack_in;
165 : :
166 : : /*
167 : : * Page level 1 is the lowest non-leaf level, just above the leaves. So,
168 : : * if we're on level 1 and were asked to lock the leaf page in write mode,
169 : : * lock the next page in write mode, because it must be a leaf.
170 : : */
1145 pg@bowt.ie 171 [ + + + + ]: 8015362 : if (opaque->btpo_level == 1 && access == BT_WRITE)
2087 akorotkov@postgresql 172 : 2952983 : page_access = BT_WRITE;
173 : :
174 : : /* drop the read lock on the page, then acquire one on its child */
1382 pg@bowt.ie 175 : 8015362 : *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access);
176 : :
177 : : /* okay, all set to move down a level */
8668 tgl@sss.pgh.pa.us 178 : 8015362 : stack_in = new_stack;
179 : : }
180 : :
181 : : /*
182 : : * If we were asked to lock the leaf page in write mode but didn't manage
183 : : * to, relock it now. This should only happen when the root page is a leaf
184 : : * page (and the only page in the index other than the metapage).
185 : : */
2087 akorotkov@postgresql 186 [ + + + + ]: 9670696 : if (access == BT_WRITE && page_access == BT_READ)
187 : : {
188 : : /* trade in our read lock for a write lock */
1363 pg@bowt.ie 189 : 375063 : _bt_unlockbuf(rel, *bufP);
190 : 375063 : _bt_lockbuf(rel, *bufP, BT_WRITE);
191 : :
192 : : /*
193 : : * Race -- the leaf page may have split after we dropped the read lock
194 : : * but before we acquired a write lock. If it has, we may need to
195 : : * move right to its new sibling. Do that.
196 : : */
219 tmunro@postgresql.or 197 :GNC 375063 : *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE);
198 : : }
199 : :
8668 tgl@sss.pgh.pa.us 200 :CBC 9670696 : return stack_in;
201 : : }
202 : :
203 : : /*
204 : : * _bt_moveright() -- move right in the btree if necessary.
205 : : *
206 : : * When we follow a pointer to reach a page, it is possible that
207 : : * the page has changed in the meanwhile. If this happens, we're
208 : : * guaranteed that the page has "split right" -- that is, that any
209 : : * data that appeared on the page originally is either on the page
210 : : * or strictly to the right of it.
211 : : *
212 : : * This routine decides whether or not we need to move right in the
213 : : * tree by examining the high key entry on the page. If that entry is
214 : : * strictly less than the scankey, or <= the scankey in the
215 : : * key.nextkey=true case, then we followed the wrong link and we need
216 : : * to move right.
217 : : *
218 : : * The passed insertion-type scankey can omit the rightmost column(s) of the
219 : : * index. (see nbtree/README)
220 : : *
221 : : * When key.nextkey is false (the usual case), we are looking for the first
222 : : * item >= key. When key.nextkey is true, we are looking for the first item
223 : : * strictly greater than key.
224 : : *
225 : : * If forupdate is true, we will attempt to finish any incomplete splits
226 : : * that we encounter. This is required when locking a target page for an
227 : : * insertion, because we don't allow inserting on a page before the split is
228 : : * completed. 'heaprel' and 'stack' are only used if forupdate is true.
229 : : *
230 : : * On entry, we have the buffer pinned and a lock of the type specified by
231 : : * 'access'. If we move right, we release the buffer and lock and acquire
232 : : * the same on the right sibling. Return value is the buffer we stop at.
233 : : */
234 : : Buffer
10141 scrappy@hub.org 235 : 18061121 : _bt_moveright(Relation rel,
236 : : Relation heaprel,
237 : : BTScanInsert key,
238 : : Buffer buf,
239 : : bool forupdate,
240 : : BTStack stack,
241 : : int access)
242 : : {
243 : : Page page;
244 : : BTPageOpaque opaque;
245 : : int32 cmpval;
246 : :
309 pg@bowt.ie 247 [ + + - + ]: 18061121 : Assert(!forupdate || heaprel != NULL);
248 : :
249 : : /*
250 : : * When nextkey = false (normal case): if the scan key that brought us to
251 : : * this page is > the high key stored on the page, then the page has split
252 : : * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could
253 : : * have some duplicates to the right as well as the left, but that's
254 : : * something that's only ever dealt with on the leaf level, after
255 : : * _bt_search has found an initial leaf page.)
256 : : *
257 : : * When nextkey = true: move right if the scan key is >= page's high key.
258 : : * (Note that key.scantid cannot be set in this case.)
259 : : *
260 : : * The page could even have split more than once, so scan as far as
261 : : * needed.
262 : : *
263 : : * We also have to move right if we followed a link that brought us to a
264 : : * dead page.
265 : : */
1852 266 : 18061121 : cmpval = key->nextkey ? 0 : 1;
267 : :
268 : : for (;;)
269 : : {
2916 kgrittn@postgresql.o 270 : 18062462 : page = BufferGetPage(buf);
744 michael@paquier.xyz 271 : 18062462 : opaque = BTPageGetOpaque(page);
272 : :
3680 heikki.linnakangas@i 273 [ + + ]: 18062462 : if (P_RIGHTMOST(opaque))
274 : 13757685 : break;
275 : :
276 : : /*
277 : : * Finish any incomplete splits we encounter along the way.
278 : : */
279 [ + + - + ]: 4304777 : if (forupdate && P_INCOMPLETE_SPLIT(opaque))
3680 heikki.linnakangas@i 280 :UBC 0 : {
281 : 0 : BlockNumber blkno = BufferGetBlockNumber(buf);
282 : :
283 : : /* upgrade our lock if necessary */
284 [ # # ]: 0 : if (access == BT_READ)
285 : : {
1363 pg@bowt.ie 286 : 0 : _bt_unlockbuf(rel, buf);
287 : 0 : _bt_lockbuf(rel, buf, BT_WRITE);
288 : : }
289 : :
3680 heikki.linnakangas@i 290 [ # # ]: 0 : if (P_INCOMPLETE_SPLIT(opaque))
379 andres@anarazel.de 291 : 0 : _bt_finish_split(rel, heaprel, buf, stack);
292 : : else
3680 heikki.linnakangas@i 293 : 0 : _bt_relbuf(rel, buf);
294 : :
295 : : /* re-acquire the lock in the right mode, and re-check */
309 pg@bowt.ie 296 : 0 : buf = _bt_getbuf(rel, blkno, access);
3680 heikki.linnakangas@i 297 : 0 : continue;
298 : : }
299 : :
1852 pg@bowt.ie 300 [ + - + + ]:CBC 4304777 : if (P_IGNORE(opaque) || _bt_compare(rel, key, page, P_HIKEY) >= cmpval)
301 : : {
302 : : /* step right one page */
3680 heikki.linnakangas@i 303 : 1341 : buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
304 : 1341 : continue;
305 : : }
306 : : else
307 : : break;
308 : : }
309 : :
7722 tgl@sss.pgh.pa.us 310 [ - + ]: 18061121 : if (P_IGNORE(opaque))
5949 tgl@sss.pgh.pa.us 311 [ # # ]:UBC 0 : elog(ERROR, "fell off the end of index \"%s\"",
312 : : RelationGetRelationName(rel));
313 : :
9357 bruce@momjian.us 314 :CBC 18061121 : return buf;
315 : : }
316 : :
317 : : /*
318 : : * _bt_binsrch() -- Do a binary search for a key on a particular page.
319 : : *
320 : : * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
321 : : * of the last key < given scankey, or last key <= given scankey if nextkey
322 : : * is true. (Since _bt_compare treats the first data key of such a page as
323 : : * minus infinity, there will be at least one key < scankey, so the result
324 : : * always points at one of the keys on the page.)
325 : : *
326 : : * On a leaf page, _bt_binsrch() returns the final result of the initial
327 : : * positioning process that started with _bt_first's call to _bt_search.
328 : : * We're returning a non-pivot tuple offset, so things are a little different.
329 : : * It is possible that we'll return an offset that's either past the last
330 : : * non-pivot slot, or (in the case of a backward scan) before the first slot.
331 : : *
332 : : * This procedure is not responsible for walking right, it just examines
333 : : * the given page. _bt_binsrch() has no lock or refcount side effects
334 : : * on the buffer.
335 : : */
336 : : static OffsetNumber
10141 scrappy@hub.org 337 : 14154020 : _bt_binsrch(Relation rel,
338 : : BTScanInsert key,
339 : : Buffer buf)
340 : : {
341 : : Page page;
342 : : BTPageOpaque opaque;
343 : : OffsetNumber low,
344 : : high;
345 : : int32 result,
346 : : cmpval;
347 : :
2916 kgrittn@postgresql.o 348 : 14154020 : page = BufferGetPage(buf);
744 michael@paquier.xyz 349 : 14154020 : opaque = BTPageGetOpaque(page);
350 : :
351 : : /* Requesting nextkey semantics while using scantid seems nonsensical */
1679 pg@bowt.ie 352 [ + + - + ]: 14154020 : Assert(!key->nextkey || key->scantid == NULL);
353 : : /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */
354 [ + + - + ]: 14154020 : Assert(!P_ISLEAF(opaque) || key->scantid == NULL);
355 : :
8668 tgl@sss.pgh.pa.us 356 [ + + ]: 14154020 : low = P_FIRSTDATAKEY(opaque);
9716 bruce@momjian.us 357 : 14154020 : high = PageGetMaxOffsetNumber(page);
358 : :
359 : : /*
360 : : * If there are no keys on the page, return the first available slot. Note
361 : : * this covers two cases: the page is really empty (no keys), or it
362 : : * contains only a high key. The latter case is possible after vacuuming.
363 : : * This can never happen on an internal page, however, since they are
364 : : * never empty (an internal page must have at least one child).
365 : : */
1852 pg@bowt.ie 366 [ + + ]: 14154020 : if (unlikely(high < low))
9357 bruce@momjian.us 367 : 4249 : return low;
368 : :
369 : : /*
370 : : * Binary search to find the first key on the page >= scan key, or first
371 : : * key > scankey when nextkey is true.
372 : : *
373 : : * For nextkey=false (cmpval=1), the loop invariant is: all slots before
374 : : * 'low' are < scan key, all slots at or after 'high' are >= scan key.
375 : : *
376 : : * For nextkey=true (cmpval=0), the loop invariant is: all slots before
377 : : * 'low' are <= scan key, all slots at or after 'high' are > scan key.
378 : : *
379 : : * We can fall out when high == low.
380 : : */
9039 tgl@sss.pgh.pa.us 381 : 14149771 : high++; /* establish the loop invariant for high */
382 : :
1852 pg@bowt.ie 383 : 14149771 : cmpval = key->nextkey ? 0 : 1; /* select comparison value */
384 : :
9039 tgl@sss.pgh.pa.us 385 [ + + ]: 92681283 : while (high > low)
386 : : {
8768 bruce@momjian.us 387 : 78531512 : OffsetNumber mid = low + ((high - low) / 2);
388 : :
389 : : /* We have low <= mid < high, so mid points at a real slot */
390 : :
1852 pg@bowt.ie 391 : 78531512 : result = _bt_compare(rel, key, page, mid);
392 : :
7420 tgl@sss.pgh.pa.us 393 [ + + ]: 78531512 : if (result >= cmpval)
9039 394 : 49518835 : low = mid + 1;
395 : : else
396 : 29012677 : high = mid;
397 : : }
398 : :
399 : : /*
400 : : * At this point we have high == low.
401 : : *
402 : : * On a leaf page we always return the first non-pivot tuple >= scan key
403 : : * (resp. > scan key) for forward scan callers. For backward scans, it's
404 : : * always the _last_ non-pivot tuple < scan key (resp. <= scan key).
405 : : */
8668 406 [ + + ]: 14149771 : if (P_ISLEAF(opaque))
407 : : {
408 : : /*
409 : : * In the backward scan case we're supposed to locate the last
410 : : * matching tuple on the leaf level -- not the first matching tuple
411 : : * (the last tuple will be the first one returned by the scan).
412 : : *
413 : : * At this point we've located the first non-pivot tuple immediately
414 : : * after the last matching tuple (which might just be maxoff + 1).
415 : : * Compensate by stepping back.
416 : : */
128 pg@bowt.ie 417 [ + + ]:GNC 6134409 : if (key->backward)
418 : 23915 : return OffsetNumberPrev(low);
419 : :
9039 tgl@sss.pgh.pa.us 420 :CBC 6110494 : return low;
421 : : }
422 : :
423 : : /*
424 : : * On a non-leaf page, return the last key < scan key (resp. <= scan key).
425 : : * There must be one if _bt_compare() is playing by the rules.
426 : : *
427 : : * _bt_compare() will seldom see any exactly-matching pivot tuples, since
428 : : * a truncated -inf heap TID is usually enough to prevent it altogether.
429 : : * Even omitted scan key entries are treated as > truncated attributes.
430 : : *
431 : : * However, during backward scans _bt_compare() interprets omitted scan
432 : : * key attributes as == corresponding truncated -inf attributes instead.
433 : : * This works just like < would work here. Under this scheme, < strategy
434 : : * backward scans will always directly descend to the correct leaf page.
435 : : * In particular, they will never incur an "extra" leaf page access with a
436 : : * scan key that happens to contain the same prefix of values as some
437 : : * pivot tuple's untruncated prefix. VACUUM relies on this guarantee when
438 : : * it uses a leaf page high key to "re-find" a page undergoing deletion.
439 : : */
8668 440 [ + + - + ]: 8015362 : Assert(low > P_FIRSTDATAKEY(opaque));
441 : :
9039 442 : 8015362 : return OffsetNumberPrev(low);
443 : : }
444 : :
445 : : /*
446 : : *
447 : : * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search.
448 : : *
449 : : * Like _bt_binsrch(), but with support for caching the binary search
450 : : * bounds. Only used during insertion, and only on the leaf page that it
451 : : * looks like caller will insert tuple on. Exclusive-locked and pinned
452 : : * leaf page is contained within insertstate.
453 : : *
454 : : * Caches the bounds fields in insertstate so that a subsequent call can
455 : : * reuse the low and strict high bounds of original binary search. Callers
456 : : * that use these fields directly must be prepared for the case where low
457 : : * and/or stricthigh are not on the same page (one or both exceed maxoff
458 : : * for the page). The case where there are no items on the page (high <
459 : : * low) makes bounds invalid.
460 : : *
461 : : * Caller is responsible for invalidating bounds when it modifies the page
462 : : * before calling here a second time, and for dealing with posting list
463 : : * tuple matches (callers can use insertstate's postingoff field to
464 : : * determine which existing heap TID will need to be replaced by a posting
465 : : * list split).
466 : : */
467 : : OffsetNumber
1852 pg@bowt.ie 468 : 6029598 : _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
469 : : {
470 : 6029598 : BTScanInsert key = insertstate->itup_key;
471 : : Page page;
472 : : BTPageOpaque opaque;
473 : : OffsetNumber low,
474 : : high,
475 : : stricthigh;
476 : : int32 result,
477 : : cmpval;
478 : :
479 : 6029598 : page = BufferGetPage(insertstate->buf);
744 michael@paquier.xyz 480 : 6029598 : opaque = BTPageGetOpaque(page);
481 : :
1852 pg@bowt.ie 482 [ - + ]: 6029598 : Assert(P_ISLEAF(opaque));
483 [ - + ]: 6029598 : Assert(!key->nextkey);
1509 484 [ - + ]: 6029598 : Assert(insertstate->postingoff == 0);
485 : :
1852 486 [ + + ]: 6029598 : if (!insertstate->bounds_valid)
487 : : {
488 : : /* Start new binary search */
489 [ + + ]: 3571276 : low = P_FIRSTDATAKEY(opaque);
490 : 3571276 : high = PageGetMaxOffsetNumber(page);
491 : : }
492 : : else
493 : : {
494 : : /* Restore result of previous binary search against same page */
495 : 2458322 : low = insertstate->low;
496 : 2458322 : high = insertstate->stricthigh;
497 : : }
498 : :
499 : : /* If there are no keys on the page, return the first available slot */
500 [ + + ]: 6029598 : if (unlikely(high < low))
501 : : {
502 : : /* Caller can't reuse bounds */
503 : 10767 : insertstate->low = InvalidOffsetNumber;
504 : 10767 : insertstate->stricthigh = InvalidOffsetNumber;
505 : 10767 : insertstate->bounds_valid = false;
506 : 10767 : return low;
507 : : }
508 : :
509 : : /*
510 : : * Binary search to find the first key on the page >= scan key. (nextkey
511 : : * is always false when inserting).
512 : : *
513 : : * The loop invariant is: all slots before 'low' are < scan key, all slots
514 : : * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is
515 : : * maintained to save additional search effort for caller.
516 : : *
517 : : * We can fall out when high == low.
518 : : */
519 [ + + ]: 6018831 : if (!insertstate->bounds_valid)
520 : 3560509 : high++; /* establish the loop invariant for high */
521 : 6018831 : stricthigh = high; /* high initially strictly higher */
522 : :
523 : 6018831 : cmpval = 1; /* !nextkey comparison value */
524 : :
525 [ + + ]: 32127092 : while (high > low)
526 : : {
527 : 26108261 : OffsetNumber mid = low + ((high - low) / 2);
528 : :
529 : : /* We have low <= mid < high, so mid points at a real slot */
530 : :
531 : 26108261 : result = _bt_compare(rel, key, page, mid);
532 : :
533 [ + + ]: 26108261 : if (result >= cmpval)
534 : 20235637 : low = mid + 1;
535 : : else
536 : : {
537 : 5872624 : high = mid;
538 [ + + ]: 5872624 : if (result != 0)
539 : 5344681 : stricthigh = high;
540 : : }
541 : :
542 : : /*
543 : : * If tuple at offset located by binary search is a posting list whose
544 : : * TID range overlaps with caller's scantid, perform posting list
545 : : * binary search to set postingoff for caller. Caller must split the
546 : : * posting list when postingoff is set. This should happen
547 : : * infrequently.
548 : : */
1509 549 [ + + + + + + ]: 26108261 : if (unlikely(result == 0 && key->scantid != NULL))
550 : : {
551 : : /*
552 : : * postingoff should never be set more than once per leaf page
553 : : * binary search. That would mean that there are duplicate table
554 : : * TIDs in the index, which is never okay. Check for that here.
555 : : */
900 556 [ - + ]: 213154 : if (insertstate->postingoff != 0)
900 pg@bowt.ie 557 [ # # ]:UBC 0 : ereport(ERROR,
558 : : (errcode(ERRCODE_INDEX_CORRUPTED),
559 : : errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"",
560 : : ItemPointerGetBlockNumber(key->scantid),
561 : : ItemPointerGetOffsetNumber(key->scantid),
562 : : low, stricthigh,
563 : : BufferGetBlockNumber(insertstate->buf),
564 : : RelationGetRelationName(rel))));
565 : :
1509 pg@bowt.ie 566 :CBC 213154 : insertstate->postingoff = _bt_binsrch_posting(key, page, mid);
567 : : }
568 : : }
569 : :
570 : : /*
571 : : * On a leaf page, a binary search always returns the first key >= scan
572 : : * key (at least in !nextkey case), which could be the last slot + 1. This
573 : : * is also the lower bound of cached search.
574 : : *
575 : : * stricthigh may also be the last slot + 1, which prevents caller from
576 : : * using bounds directly, but is still useful to us if we're called a
577 : : * second time with cached bounds (cached low will be < stricthigh when
578 : : * that happens).
579 : : */
1852 580 : 6018831 : insertstate->low = low;
581 : 6018831 : insertstate->stricthigh = stricthigh;
582 : 6018831 : insertstate->bounds_valid = true;
583 : :
584 : 6018831 : return low;
585 : : }
586 : :
587 : : /*----------
588 : : * _bt_binsrch_posting() -- posting list binary search.
589 : : *
590 : : * Helper routine for _bt_binsrch_insert().
591 : : *
592 : : * Returns offset into posting list where caller's scantid belongs.
593 : : *----------
594 : : */
595 : : static int
1509 596 : 213154 : _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum)
597 : : {
598 : : IndexTuple itup;
599 : : ItemId itemid;
600 : : int low,
601 : : high,
602 : : mid,
603 : : res;
604 : :
605 : : /*
606 : : * If this isn't a posting tuple, then the index must be corrupt (if it is
607 : : * an ordinary non-pivot tuple then there must be an existing tuple with a
608 : : * heap TID that equals inserter's new heap TID/scantid). Defensively
609 : : * check that tuple is a posting list tuple whose posting list range
610 : : * includes caller's scantid.
611 : : *
612 : : * (This is also needed because contrib/amcheck's rootdescend option needs
613 : : * to be able to relocate a non-pivot tuple using _bt_binsrch_insert().)
614 : : */
615 : 213154 : itemid = PageGetItemId(page, offnum);
616 : 213154 : itup = (IndexTuple) PageGetItem(page, itemid);
617 [ + + ]: 213154 : if (!BTreeTupleIsPosting(itup))
618 : 201098 : return 0;
619 : :
620 [ + - - + ]: 12056 : Assert(key->heapkeyspace && key->allequalimage);
621 : :
622 : : /*
623 : : * In the event that posting list tuple has LP_DEAD bit set, indicate this
624 : : * to _bt_binsrch_insert() caller by returning -1, a sentinel value. A
625 : : * second call to _bt_binsrch_insert() can take place when its caller has
626 : : * removed the dead item.
627 : : */
628 [ - + ]: 12056 : if (ItemIdIsDead(itemid))
1509 pg@bowt.ie 629 :UBC 0 : return -1;
630 : :
631 : : /* "high" is past end of posting list for loop invariant */
1509 pg@bowt.ie 632 :CBC 12056 : low = 0;
633 : 12056 : high = BTreeTupleGetNPosting(itup);
634 [ - + ]: 12056 : Assert(high >= 2);
635 : :
636 [ + + ]: 96867 : while (high > low)
637 : : {
638 : 84811 : mid = low + ((high - low) / 2);
639 : 84811 : res = ItemPointerCompare(key->scantid,
640 : : BTreeTupleGetPostingN(itup, mid));
641 : :
642 [ + + ]: 84811 : if (res > 0)
643 : 44133 : low = mid + 1;
644 [ + - ]: 40678 : else if (res < 0)
645 : 40678 : high = mid;
646 : : else
1509 pg@bowt.ie 647 :UBC 0 : return mid;
648 : : }
649 : :
650 : : /* Exact match not found */
1509 pg@bowt.ie 651 :CBC 12056 : return low;
652 : : }
653 : :
654 : : /*----------
655 : : * _bt_compare() -- Compare insertion-type scankey to tuple on a page.
656 : : *
657 : : * page/offnum: location of btree item to be compared to.
658 : : *
659 : : * This routine returns:
660 : : * <0 if scankey < tuple at offnum;
661 : : * 0 if scankey == tuple at offnum;
662 : : * >0 if scankey > tuple at offnum.
663 : : *
664 : : * NULLs in the keys are treated as sortable values. Therefore
665 : : * "equality" does not necessarily mean that the item should be returned
666 : : * to the caller as a matching key. Similarly, an insertion scankey
667 : : * with its scantid set is treated as equal to a posting tuple whose TID
668 : : * range overlaps with its scantid. There generally won't be a
669 : : * matching TID in the posting tuple, which the caller must handle
670 : : * itself (e.g., by splitting the posting list tuple).
671 : : *
672 : : * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
673 : : * "minus infinity": this routine will always claim it is less than the
674 : : * scankey. The actual key value stored is explicitly truncated to 0
675 : : * attributes (explicitly minus infinity) with version 3+ indexes, but
676 : : * that isn't relied upon. This allows us to implement the Lehman and
677 : : * Yao convention that the first down-link pointer is before the first
678 : : * key. See backend/access/nbtree/README for details.
679 : : *----------
680 : : */
681 : : int32
10141 scrappy@hub.org 682 : 119490871 : _bt_compare(Relation rel,
683 : : BTScanInsert key,
684 : : Page page,
685 : : OffsetNumber offnum)
686 : : {
8668 tgl@sss.pgh.pa.us 687 : 119490871 : TupleDesc itupdesc = RelationGetDescr(rel);
744 michael@paquier.xyz 688 : 119490871 : BTPageOpaque opaque = BTPageGetOpaque(page);
689 : : IndexTuple itup;
690 : : ItemPointer heapTid;
691 : : ScanKey scankey;
692 : : int ncmpkey;
693 : : int ntupatts;
694 : : int32 result;
695 : :
1852 pg@bowt.ie 696 [ - + ]: 119490871 : Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum));
697 [ - + ]: 119490871 : Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel));
698 [ - + - - ]: 119490871 : Assert(key->heapkeyspace || key->scantid == NULL);
699 : :
700 : : /*
701 : : * Force result ">" if target item is first data item on an internal page
702 : : * --- see NOTE above.
703 : : */
8424 bruce@momjian.us 704 [ + + + + + + ]: 119490871 : if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
9357 705 : 1398874 : return 1;
706 : :
6654 tgl@sss.pgh.pa.us 707 : 118091997 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1852 pg@bowt.ie 708 [ + + ]: 118091997 : ntupatts = BTreeTupleGetNAtts(itup, rel);
709 : :
710 : : /*
711 : : * The scan key is set up with the attribute number associated with each
712 : : * term in the key. It is important that, if the index is multi-key, the
713 : : * scan contain the first k key attributes, and that they be in order. If
714 : : * you think about how multi-key ordering works, you'll understand why
715 : : * this is.
716 : : *
717 : : * We don't test for violation of this condition here, however. The
718 : : * initial setup for the index scan had better have gotten it right (see
719 : : * _bt_first).
720 : : */
721 : :
722 : 118091997 : ncmpkey = Min(ntupatts, key->keysz);
723 [ - + - - ]: 118091997 : Assert(key->heapkeyspace || ncmpkey == key->keysz);
1509 724 [ + + - + ]: 118091997 : Assert(!BTreeTupleIsPosting(itup) || key->allequalimage);
1852 725 : 118091997 : scankey = key->scankeys;
726 [ + + ]: 148155593 : for (int i = 1; i <= ncmpkey; i++)
727 : : {
728 : : Datum datum;
729 : : bool isNull;
730 : :
7459 tgl@sss.pgh.pa.us 731 : 137898991 : datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
732 : :
2489 733 [ + + ]: 137898991 : if (scankey->sk_flags & SK_ISNULL) /* key is NULL */
734 : : {
8668 735 [ + + ]: 251989 : if (isNull)
8720 736 : 78512 : result = 0; /* NULL "=" NULL */
6305 737 [ + + ]: 173477 : else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
738 : 132 : result = -1; /* NULL "<" NOT_NULL */
739 : : else
8720 740 : 173345 : result = 1; /* NULL ">" NOT_NULL */
741 : : }
8668 742 [ + + ]: 137647002 : else if (isNull) /* key is NOT_NULL and item is NULL */
743 : : {
6305 744 [ - + ]: 153 : if (scankey->sk_flags & SK_BT_NULLS_FIRST)
6305 tgl@sss.pgh.pa.us 745 :UBC 0 : result = 1; /* NOT_NULL ">" NULL */
746 : : else
6305 tgl@sss.pgh.pa.us 747 :CBC 153 : result = -1; /* NOT_NULL "<" NULL */
748 : : }
749 : : else
750 : : {
751 : : /*
752 : : * The sk_func needs to be passed the index value as left arg and
753 : : * the sk_argument as right arg (they might be of different
754 : : * types). Since it is convenient for callers to think of
755 : : * _bt_compare as comparing the scankey to the index item, we have
756 : : * to flip the sign of the comparison result. (Unless it's a DESC
757 : : * column, in which case we *don't* flip the sign.)
758 : : */
4751 759 : 137646849 : result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
760 : : scankey->sk_collation,
761 : : datum,
762 : : scankey->sk_argument));
763 : :
6305 764 [ + + ]: 137646849 : if (!(scankey->sk_flags & SK_BT_DESC))
2018 765 [ + + ]: 137646816 : INVERT_COMPARE_RESULT(result);
766 : : }
767 : :
768 : : /* if the keys are unequal, return the difference */
9716 bruce@momjian.us 769 [ + + ]: 137898991 : if (result != 0)
9357 770 : 107835395 : return result;
771 : :
7459 tgl@sss.pgh.pa.us 772 : 30063596 : scankey++;
773 : : }
774 : :
775 : : /*
776 : : * All non-truncated attributes (other than heap TID) were found to be
777 : : * equal. Treat truncated attributes as minus infinity when scankey has a
778 : : * key attribute value that would otherwise be compared directly.
779 : : *
780 : : * Note: it doesn't matter if ntupatts includes non-key attributes;
781 : : * scankey won't, so explicitly excluding non-key attributes isn't
782 : : * necessary.
783 : : */
1852 pg@bowt.ie 784 [ + + ]: 10256602 : if (key->keysz > ntupatts)
785 : 112682 : return 1;
786 : :
787 : : /*
788 : : * Use the heap TID attribute and scantid to try to break the tie. The
789 : : * rules are the same as any other key attribute -- only the
790 : : * representation differs.
791 : : */
792 : 10143920 : heapTid = BTreeTupleGetHeapTID(itup);
793 [ + + ]: 10143920 : if (key->scantid == NULL)
794 : : {
795 : : /*
796 : : * Forward scans have a scankey that is considered greater than a
797 : : * truncated pivot tuple if and when the scankey has equal values for
798 : : * attributes up to and including the least significant untruncated
799 : : * attribute in tuple. Even attributes that were omitted from the
800 : : * scan key are considered greater than -inf truncated attributes.
801 : : * (See _bt_binsrch for an explanation of our backward scan behavior.)
802 : : *
803 : : * For example, if an index has the minimum two attributes (single
804 : : * user key attribute, plus heap TID attribute), and a page's high key
805 : : * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
806 : : * will not descend to the page to the left. The search will descend
807 : : * right instead. The truncated attribute in pivot tuple means that
808 : : * all non-pivot tuples on the page to the left are strictly < 'foo',
809 : : * so it isn't necessary to descend left. In other words, search
810 : : * doesn't have to descend left because it isn't interested in a match
811 : : * that has a heap TID value of -inf.
812 : : *
813 : : * Note: the heap TID part of the test ensures that scankey is being
814 : : * compared to a pivot tuple with one or more truncated -inf key
815 : : * attributes. The heap TID attribute is the last key attribute in
816 : : * every index, of course, but other than that it isn't special.
817 : : */
128 pg@bowt.ie 818 [ + + + + :GNC 8186791 : if (!key->backward && key->keysz == ntupatts && heapTid == NULL &&
+ + ]
819 [ + - ]: 4658 : key->heapkeyspace)
1852 pg@bowt.ie 820 :CBC 4658 : return 1;
821 : :
822 : : /* All provided scankey arguments found to be equal */
823 : 8182133 : return 0;
824 : : }
825 : :
826 : : /*
827 : : * Treat truncated heap TID as minus infinity, since scankey has a key
828 : : * attribute value (scantid) that would otherwise be compared directly
829 : : */
830 [ - + ]: 1957129 : Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
831 [ + + ]: 1957129 : if (heapTid == NULL)
832 : 1936 : return 1;
833 : :
834 : : /*
835 : : * Scankey must be treated as equal to a posting list tuple if its scantid
836 : : * value falls within the range of the posting list. In all other cases
837 : : * there can only be a single heap TID value, which is compared directly
838 : : * with scantid.
839 : : */
840 [ - + ]: 1955193 : Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
1509 841 : 1955193 : result = ItemPointerCompare(key->scantid, heapTid);
842 [ + + + + ]: 1955193 : if (result <= 0 || !BTreeTupleIsPosting(itup))
843 : 1878692 : return result;
844 : : else
845 : : {
846 : 76501 : result = ItemPointerCompare(key->scantid,
847 : : BTreeTupleGetMaxHeapTID(itup));
848 [ + + ]: 76501 : if (result > 0)
849 : 64445 : return 1;
850 : : }
851 : :
852 : 12056 : return 0;
853 : : }
854 : :
855 : : /*
856 : : * _bt_first() -- Find the first item in a scan.
857 : : *
858 : : * We need to be clever about the direction of scan, the search
859 : : * conditions, and the tree ordering. We find the first item (or,
860 : : * if backwards scan, the last item) in the tree that satisfies the
861 : : * qualifications in the scan key. On success exit, the page containing
862 : : * the current index tuple is pinned but not locked, and data about
863 : : * the matching tuple(s) on the page has been loaded into so->currPos.
864 : : * scan->xs_heaptid is set to the heap TID of the current tuple, and if
865 : : * requested, scan->xs_itup points to a copy of the index tuple.
866 : : *
867 : : * If there are no matching items in the index, we return false, with no
868 : : * pins or locks held.
869 : : *
870 : : * Note that scan->keyData[], and the so->keyData[] scankey built from it,
871 : : * are both search-type scankeys (see nbtree/README for more about this).
872 : : * Within this routine, we build a temporary insertion-type scankey to use
873 : : * in locating the scan start position.
874 : : */
875 : : bool
10141 scrappy@hub.org 876 : 6420890 : _bt_first(IndexScanDesc scan, ScanDirection dir)
877 : : {
7996 tgl@sss.pgh.pa.us 878 : 6420890 : Relation rel = scan->indexRelation;
879 : 6420890 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
880 : : Buffer buf;
881 : : BTStack stack;
882 : : OffsetNumber offnum;
883 : : StrategyNumber strat;
884 : : BTScanInsertData inskey;
885 : : ScanKey startKeys[INDEX_MAX_KEYS];
886 : : ScanKeyData notnullkeys[INDEX_MAX_KEYS];
128 pg@bowt.ie 887 :GNC 6420890 : int keysz = 0;
888 : : int i;
889 : : bool status;
890 : : StrategyNumber strat_total;
891 : : BTScanPosItem *currItem;
892 : : BlockNumber blkno;
893 : :
3308 kgrittn@postgresql.o 894 [ + - - + - + ]:CBC 6420890 : Assert(!BTScanPosIsValid(so->currPos));
895 : :
6167 tgl@sss.pgh.pa.us 896 [ + + + + + + ]: 6420890 : pgstat_count_index_scan(rel);
897 : :
898 : : /*
899 : : * Examine the scan keys and eliminate any redundant keys; also mark the
900 : : * keys that must be matched to continue the scan.
901 : : */
7459 902 : 6420890 : _bt_preprocess_keys(scan);
903 : :
904 : : /*
905 : : * Quit now if _bt_preprocess_keys() discovered that the scan keys can
906 : : * never be satisfied (eg, x == 1 AND x > 2).
907 : : */
8424 bruce@momjian.us 908 [ + + ]: 6420890 : if (!so->qual_ok)
909 : : {
1305 akapila@postgresql.o 910 : 1387 : _bt_parallel_done(scan);
8000 tgl@sss.pgh.pa.us 911 : 1387 : return false;
912 : : }
913 : :
914 : : /*
915 : : * For parallel scans, get the starting page from shared state. If the
916 : : * scan has not started, proceed to find the first leaf page in the usual
917 : : * way while keeping other participating processes waiting. If the scan
918 : : * has already begun, use the page number from the shared structure.
919 : : *
920 : : * When a parallel scan has another primitive index scan scheduled, a
921 : : * parallel worker will seize the scan for that purpose now. This is
922 : : * similar to the case where the top-level scan hasn't started.
923 : : */
2615 rhaas@postgresql.org 924 [ + + ]: 6419503 : if (scan->parallel_scan != NULL)
925 : : {
8 pg@bowt.ie 926 :GNC 217 : status = _bt_parallel_seize(scan, &blkno, true);
927 : :
928 : : /*
929 : : * Initialize arrays (when _bt_parallel_seize didn't already set up
930 : : * the next primitive index scan)
931 : : */
932 [ + + + + ]: 217 : if (so->numArrayKeys && !so->needPrimScan)
933 : 15 : _bt_start_array_keys(scan, dir);
934 : :
2615 rhaas@postgresql.org 935 [ + + ]:CBC 217 : if (!status)
936 : 29 : return false;
937 [ + + ]: 188 : else if (blkno == P_NONE)
938 : : {
939 : 5 : _bt_parallel_done(scan);
940 : 5 : return false;
941 : : }
942 [ + + ]: 183 : else if (blkno != InvalidBlockNumber)
943 : : {
944 [ - + ]: 124 : if (!_bt_parallel_readpage(scan, blkno, dir))
2615 rhaas@postgresql.org 945 :UBC 0 : return false;
2615 rhaas@postgresql.org 946 :CBC 124 : goto readcomplete;
947 : : }
948 : : }
8 pg@bowt.ie 949 [ + + + + ]:GNC 6419286 : else if (so->numArrayKeys && !so->needPrimScan)
950 : : {
951 : : /*
952 : : * First _bt_first call (for current btrescan) without parallelism.
953 : : *
954 : : * Initialize arrays, and the corresponding scan keys that were just
955 : : * output by _bt_preprocess_keys.
956 : : */
957 : 380 : _bt_start_array_keys(scan, dir);
958 : : }
959 : :
960 : : /*----------
961 : : * Examine the scan keys to discover where we need to start the scan.
962 : : *
963 : : * We want to identify the keys that can be used as starting boundaries;
964 : : * these are =, >, or >= keys for a forward scan or =, <, <= keys for
965 : : * a backwards scan. We can use keys for multiple attributes so long as
966 : : * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
967 : : * a > or < boundary or find an attribute with no boundary (which can be
968 : : * thought of as the same as "> -infinity"), we can't use keys for any
969 : : * attributes to its right, because it would break our simplistic notion
970 : : * of what initial positioning strategy to use.
971 : : *
972 : : * When the scan keys include cross-type operators, _bt_preprocess_keys
973 : : * may not be able to eliminate redundant keys; in such cases we will
974 : : * arbitrarily pick a usable one for each attribute. This is correct
975 : : * but possibly not optimal behavior. (For example, with keys like
976 : : * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
977 : : * x=5 would be more efficient.) Since the situation only arises given
978 : : * a poorly-worded query plus an incomplete opfamily, live with it.
979 : : *
980 : : * When both equality and inequality keys appear for a single attribute
981 : : * (again, only possible when cross-type operators appear), we *must*
982 : : * select one of the equality keys for the starting point, because
983 : : * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
984 : : * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
985 : : * start at x=4, we will fail and stop before reaching x=10. If multiple
986 : : * equality quals survive preprocessing, however, it doesn't matter which
987 : : * one we use --- by definition, they are either redundant or
988 : : * contradictory.
989 : : *
990 : : * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
991 : : * If the index stores nulls at the end of the index we'll be starting
992 : : * from, and we have no boundary key for the column (which means the key
993 : : * we deduced NOT NULL from is an inequality key that constrains the other
994 : : * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
995 : : * use as a boundary key. If we didn't do this, we might find ourselves
996 : : * traversing a lot of null entries at the start of the scan.
997 : : *
998 : : * In this loop, row-comparison keys are treated the same as keys on their
999 : : * first (leftmost) columns. We'll add on lower-order columns of the row
1000 : : * comparison below, if possible.
1001 : : *
1002 : : * The selected scan keys (at most one per index column) are remembered by
1003 : : * storing their addresses into the local startKeys[] array.
1004 : : *
1005 : : * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
1006 : : * the next primitive index scan (for scans with array keys) based in part
1007 : : * on an understanding of how it'll enable us to reposition the scan.
1008 : : * They're directly aware of how we'll sometimes cons up an explicit
1009 : : * SK_SEARCHNOTNULL key. They'll even end primitive scans by applying a
1010 : : * symmetric "deduce NOT NULL" rule of their own. This allows top-level
1011 : : * scans to skip large groups of NULLs through repeated deductions about
1012 : : * key strictness (for a required inequality key) and whether NULLs in the
1013 : : * key's index column are stored last or first (relative to non-NULLs).
1014 : : * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
1015 : : * need to be kept in sync.
1016 : : *----------
1017 : : */
8966 bruce@momjian.us 1018 :CBC 6419345 : strat_total = BTEqualStrategyNumber;
8664 tgl@sss.pgh.pa.us 1019 [ + + ]: 6419345 : if (so->numberOfKeys > 0)
1020 : : {
1021 : : AttrNumber curattr;
1022 : : ScanKey chosen;
1023 : : ScanKey impliesNN;
1024 : : ScanKey cur;
1025 : :
1026 : : /*
1027 : : * chosen is the so-far-chosen key for the current attribute, if any.
1028 : : * We don't cast the decision in stone until we reach keys for the
1029 : : * next attribute.
1030 : : */
7459 1031 : 6414033 : curattr = 1;
1032 : 6414033 : chosen = NULL;
1033 : : /* Also remember any scankey that implies a NOT NULL constraint */
4547 1034 : 6414033 : impliesNN = NULL;
1035 : :
1036 : : /*
1037 : : * Loop iterates from 0 to numberOfKeys inclusive; we use the last
1038 : : * pass to handle after-last-key processing. Actual exit from the
1039 : : * loop is at one of the "break" statements below.
1040 : : */
7459 1041 : 16769620 : for (cur = so->keyData, i = 0;; cur++, i++)
1042 : : {
1043 [ + + + + ]: 16769620 : if (i >= so->numberOfKeys || cur->sk_attno != curattr)
1044 : : {
1045 : : /*
1046 : : * Done looking at keys for curattr. If we didn't find a
1047 : : * usable boundary key, see if we can deduce a NOT NULL key.
1048 : : */
4547 1049 [ + + + + + + ]: 10388190 : if (chosen == NULL && impliesNN != NULL &&
1050 [ + + ]: 32177 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1051 : : ScanDirectionIsForward(dir) :
1052 : : ScanDirectionIsBackward(dir)))
1053 : : {
1054 : : /* Yes, so build the key in notnullkeys[keysz] */
128 pg@bowt.ie 1055 :GNC 3 : chosen = &notnullkeys[keysz];
4547 tgl@sss.pgh.pa.us 1056 :UBC 0 : ScanKeyEntryInitialize(chosen,
1057 : : (SK_SEARCHNOTNULL | SK_ISNULL |
4547 tgl@sss.pgh.pa.us 1058 :CBC 3 : (impliesNN->sk_flags &
1059 : : (SK_BT_DESC | SK_BT_NULLS_FIRST))),
1060 : : curattr,
2489 1061 [ - + ]: 3 : ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
1062 : : BTGreaterStrategyNumber :
1063 : : BTLessStrategyNumber),
1064 : : InvalidOid,
1065 : : InvalidOid,
1066 : : InvalidOid,
1067 : : (Datum) 0);
1068 : : }
1069 : :
1070 : : /*
1071 : : * If we still didn't find a usable boundary key, quit; else
1072 : : * save the boundary key pointer in startKeys.
1073 : : */
7459 1074 [ + + ]: 10356013 : if (chosen == NULL)
1075 : 33515 : break;
128 pg@bowt.ie 1076 :GNC 10322498 : startKeys[keysz++] = chosen;
1077 : :
1078 : : /*
1079 : : * Adjust strat_total, and quit if we have stored a > or <
1080 : : * key.
1081 : : */
7459 tgl@sss.pgh.pa.us 1082 :CBC 10322498 : strat = chosen->sk_strategy;
1083 [ + + ]: 10322498 : if (strat != BTEqualStrategyNumber)
1084 : : {
1085 : 670748 : strat_total = strat;
1086 [ + + + + ]: 670748 : if (strat == BTGreaterStrategyNumber ||
1087 : : strat == BTLessStrategyNumber)
1088 : : break;
1089 : : }
1090 : :
1091 : : /*
1092 : : * Done if that was the last attribute, or if next key is not
1093 : : * in sequence (implying no boundary key is available for the
1094 : : * next attribute).
1095 : : */
6880 1096 [ + + ]: 9653919 : if (i >= so->numberOfKeys ||
1097 [ + + ]: 3942228 : cur->sk_attno != curattr + 1)
1098 : : break;
1099 : :
1100 : : /*
1101 : : * Reset for next attr.
1102 : : */
7459 1103 : 3941980 : curattr = cur->sk_attno;
1104 : 3941980 : chosen = NULL;
4547 1105 : 3941980 : impliesNN = NULL;
1106 : : }
1107 : :
1108 : : /*
1109 : : * Can we use this key as a starting boundary for this attr?
1110 : : *
1111 : : * If not, does it imply a NOT NULL constraint? (Because
1112 : : * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
1113 : : * *any* inequality key works for that; we need not test.)
1114 : : */
7459 1115 [ + + + - ]: 10355587 : switch (cur->sk_strategy)
1116 : : {
1117 : 56919 : case BTLessStrategyNumber:
1118 : : case BTLessEqualStrategyNumber:
4547 1119 [ + + ]: 56919 : if (chosen == NULL)
1120 : : {
1121 [ + + ]: 56010 : if (ScanDirectionIsBackward(dir))
1122 : 23842 : chosen = cur;
1123 : : else
1124 : 32168 : impliesNN = cur;
1125 : : }
7459 1126 : 56919 : break;
1127 : 9651750 : case BTEqualStrategyNumber:
1128 : : /* override any non-equality choice */
1129 : 9651750 : chosen = cur;
1130 : 9651750 : break;
1131 : 646918 : case BTGreaterEqualStrategyNumber:
1132 : : case BTGreaterStrategyNumber:
4547 1133 [ + - ]: 646918 : if (chosen == NULL)
1134 : : {
1135 [ + + ]: 646918 : if (ScanDirectionIsForward(dir))
1136 : 646903 : chosen = cur;
1137 : : else
1138 : 15 : impliesNN = cur;
1139 : : }
9133 bruce@momjian.us 1140 : 646918 : break;
1141 : : }
1142 : : }
1143 : : }
1144 : :
1145 : : /*
1146 : : * If we found no usable boundary keys, we have to start from one end of
1147 : : * the tree. Walk down that edge to the first or last key, and scan from
1148 : : * there.
1149 : : */
128 pg@bowt.ie 1150 [ + + ]:GNC 6419345 : if (keysz == 0)
1151 : : {
1152 : : bool match;
1153 : :
2615 rhaas@postgresql.org 1154 :CBC 38751 : match = _bt_endpoint(scan, dir);
1155 : :
1156 [ + + ]: 38751 : if (!match)
1157 : : {
1158 : : /* No match, so mark (parallel) scan finished */
1159 : 3944 : _bt_parallel_done(scan);
1160 : : }
1161 : :
1162 : 38751 : return match;
1163 : : }
1164 : :
1165 : : /*
1166 : : * We want to start the scan somewhere within the index. Set up an
1167 : : * insertion scankey we can use to search for the boundary point we
1168 : : * identified above. The insertion scankey is built using the keys
1169 : : * identified by startKeys[]. (Remaining insertion scankey fields are
1170 : : * initialized after initial-positioning strategy is finalized.)
1171 : : */
128 pg@bowt.ie 1172 [ - + ]:GNC 6380594 : Assert(keysz <= INDEX_MAX_KEYS);
1173 [ + + ]: 16703080 : for (i = 0; i < keysz; i++)
1174 : : {
7459 tgl@sss.pgh.pa.us 1175 :CBC 10322498 : ScanKey cur = startKeys[i];
1176 : :
6402 bruce@momjian.us 1177 [ - + ]: 10322498 : Assert(cur->sk_attno == i + 1);
1178 : :
6654 tgl@sss.pgh.pa.us 1179 [ + + ]: 10322498 : if (cur->sk_flags & SK_ROW_HEADER)
1180 : : {
1181 : : /*
1182 : : * Row comparison header: look to the first row member instead.
1183 : : *
1184 : : * The member scankeys are already in insertion format (ie, they
1185 : : * have sk_func = 3-way-comparison function), but we have to watch
1186 : : * out for nulls, which _bt_preprocess_keys didn't check. A null
1187 : : * in the first row member makes the condition unmatchable, just
1188 : : * like qual_ok = false.
1189 : : */
6305 1190 : 12 : ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
1191 : :
1192 [ - + ]: 12 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
1193 [ - + ]: 12 : if (subkey->sk_flags & SK_ISNULL)
1194 : : {
2615 rhaas@postgresql.org 1195 :UBC 0 : _bt_parallel_done(scan);
6654 tgl@sss.pgh.pa.us 1196 : 0 : return false;
1197 : : }
1852 pg@bowt.ie 1198 :CBC 12 : memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData));
1199 : :
1200 : : /*
1201 : : * If the row comparison is the last positioning key we accepted,
1202 : : * try to add additional keys from the lower-order row members.
1203 : : * (If we accepted independent conditions on additional index
1204 : : * columns, we use those instead --- doesn't seem worth trying to
1205 : : * determine which is more restrictive.) Note that this is OK
1206 : : * even if the row comparison is of ">" or "<" type, because the
1207 : : * condition applied to all but the last row member is effectively
1208 : : * ">=" or "<=", and so the extra keys don't break the positioning
1209 : : * scheme. But, by the same token, if we aren't able to use all
1210 : : * the row members, then the part of the row comparison that we
1211 : : * did use has to be treated as just a ">=" or "<=" condition, and
1212 : : * so we'd better adjust strat_total accordingly.
1213 : : */
128 pg@bowt.ie 1214 [ + - ]:GNC 12 : if (i == keysz - 1)
1215 : : {
6305 tgl@sss.pgh.pa.us 1216 :CBC 12 : bool used_all_subkeys = false;
1217 : :
1218 [ - + ]: 12 : Assert(!(subkey->sk_flags & SK_ROW_END));
1219 : : for (;;)
1220 : : {
1221 : 12 : subkey++;
1222 [ - + ]: 12 : Assert(subkey->sk_flags & SK_ROW_MEMBER);
128 pg@bowt.ie 1223 [ - + ]:GNC 12 : if (subkey->sk_attno != keysz + 1)
6654 tgl@sss.pgh.pa.us 1224 :UBC 0 : break; /* out-of-sequence, can't use it */
6305 tgl@sss.pgh.pa.us 1225 [ - + ]:CBC 12 : if (subkey->sk_strategy != cur->sk_strategy)
6305 tgl@sss.pgh.pa.us 1226 :UBC 0 : break; /* wrong direction, can't use it */
6305 tgl@sss.pgh.pa.us 1227 [ - + ]:CBC 12 : if (subkey->sk_flags & SK_ISNULL)
6654 tgl@sss.pgh.pa.us 1228 :UBC 0 : break; /* can't use null keys */
128 pg@bowt.ie 1229 [ - + ]:GNC 12 : Assert(keysz < INDEX_MAX_KEYS);
1230 : 12 : memcpy(inskey.scankeys + keysz, subkey,
1231 : : sizeof(ScanKeyData));
1232 : 12 : keysz++;
6305 tgl@sss.pgh.pa.us 1233 [ + - ]:CBC 12 : if (subkey->sk_flags & SK_ROW_END)
1234 : : {
1235 : 12 : used_all_subkeys = true;
1236 : 12 : break;
1237 : : }
1238 : : }
1239 [ - + ]: 12 : if (!used_all_subkeys)
1240 : : {
6305 tgl@sss.pgh.pa.us 1241 [ # # # ]:UBC 0 : switch (strat_total)
1242 : : {
1243 : 0 : case BTLessStrategyNumber:
1244 : 0 : strat_total = BTLessEqualStrategyNumber;
1245 : 0 : break;
1246 : 0 : case BTGreaterStrategyNumber:
1247 : 0 : strat_total = BTGreaterEqualStrategyNumber;
1248 : 0 : break;
1249 : : }
1250 : : }
6654 tgl@sss.pgh.pa.us 1251 :CBC 12 : break; /* done with outer loop */
1252 : : }
1253 : : }
1254 : : else
1255 : : {
1256 : : /*
1257 : : * Ordinary comparison key. Transform the search-style scan key
1258 : : * to an insertion scan key by replacing the sk_func with the
1259 : : * appropriate btree comparison function.
1260 : : *
1261 : : * If scankey operator is not a cross-type comparison, we can use
1262 : : * the cached comparison function; otherwise gotta look it up in
1263 : : * the catalogs. (That can't lead to infinite recursion, since no
1264 : : * indexscan initiated by syscache lookup will use cross-data-type
1265 : : * operators.)
1266 : : *
1267 : : * We support the convention that sk_subtype == InvalidOid means
1268 : : * the opclass input type; this is a hack to simplify life for
1269 : : * ScanKeyInit().
1270 : : */
6322 1271 [ + + ]: 10322486 : if (cur->sk_subtype == rel->rd_opcintype[i] ||
1272 [ + + ]: 10021851 : cur->sk_subtype == InvalidOid)
6654 1273 : 10315706 : {
1274 : : FmgrInfo *procinfo;
1275 : :
1276 : 10315706 : procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
1852 pg@bowt.ie 1277 : 10315706 : ScanKeyEntryInitializeWithInfo(inskey.scankeys + i,
1278 : : cur->sk_flags,
6654 tgl@sss.pgh.pa.us 1279 : 10315706 : cur->sk_attno,
1280 : : InvalidStrategy,
1281 : : cur->sk_subtype,
1282 : : cur->sk_collation,
1283 : : procinfo,
1284 : : cur->sk_argument);
1285 : : }
1286 : : else
1287 : : {
1288 : : RegProcedure cmp_proc;
1289 : :
6322 1290 : 6780 : cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
1291 : 6780 : rel->rd_opcintype[i],
1292 : : cur->sk_subtype,
1293 : : BTORDER_PROC);
1294 [ - + ]: 6780 : if (!RegProcedureIsValid(cmp_proc))
6322 tgl@sss.pgh.pa.us 1295 [ # # ]:UBC 0 : elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
1296 : : BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
1297 : : cur->sk_attno, RelationGetRelationName(rel));
1852 pg@bowt.ie 1298 :CBC 6780 : ScanKeyEntryInitialize(inskey.scankeys + i,
1299 : : cur->sk_flags,
6654 tgl@sss.pgh.pa.us 1300 : 6780 : cur->sk_attno,
1301 : : InvalidStrategy,
1302 : : cur->sk_subtype,
1303 : : cur->sk_collation,
1304 : : cmp_proc,
1305 : : cur->sk_argument);
1306 : : }
1307 : : }
1308 : : }
1309 : :
1310 : : /*----------
1311 : : * Examine the selected initial-positioning strategy to determine exactly
1312 : : * where we need to start the scan, and set flag variables to control the
1313 : : * initial descent by _bt_search (and our _bt_binsrch call for the leaf
1314 : : * page _bt_search returns).
1315 : : *----------
1316 : : */
128 pg@bowt.ie 1317 :GNC 6380594 : _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage);
1318 : 6380594 : inskey.anynullkeys = false; /* unused */
1319 : 6380594 : inskey.scantid = NULL;
1320 : 6380594 : inskey.keysz = keysz;
7420 tgl@sss.pgh.pa.us 1321 [ + + + + :CBC 6380594 : switch (strat_total)
+ - ]
1322 : : {
1323 : 23842 : case BTLessStrategyNumber:
1324 : :
128 pg@bowt.ie 1325 :GNC 23842 : inskey.nextkey = false;
1326 : 23842 : inskey.backward = true;
7420 tgl@sss.pgh.pa.us 1327 :CBC 23842 : break;
1328 : :
7420 tgl@sss.pgh.pa.us 1329 :GBC 3 : case BTLessEqualStrategyNumber:
1330 : :
128 pg@bowt.ie 1331 :GNC 3 : inskey.nextkey = true;
1332 : 3 : inskey.backward = true;
7420 tgl@sss.pgh.pa.us 1333 :GBC 3 : break;
1334 : :
7420 tgl@sss.pgh.pa.us 1335 :CBC 5709846 : case BTEqualStrategyNumber:
1336 : :
1337 : : /*
1338 : : * If a backward scan was specified, need to start with last equal
1339 : : * item not first one.
1340 : : */
1341 [ + + ]: 5709846 : if (ScanDirectionIsBackward(dir))
1342 : : {
1343 : : /*
1344 : : * This is the same as the <= strategy
1345 : : */
128 pg@bowt.ie 1346 :GNC 74 : inskey.nextkey = true;
1347 : 74 : inskey.backward = true;
1348 : : }
1349 : : else
1350 : : {
1351 : : /*
1352 : : * This is the same as the >= strategy
1353 : : */
1354 : 5709772 : inskey.nextkey = false;
1355 : 5709772 : inskey.backward = false;
1356 : : }
7420 tgl@sss.pgh.pa.us 1357 :CBC 5709846 : break;
1358 : :
1359 : 2166 : case BTGreaterEqualStrategyNumber:
1360 : :
1361 : : /*
1362 : : * Find first item >= scankey
1363 : : */
128 pg@bowt.ie 1364 :GNC 2166 : inskey.nextkey = false;
1365 : 2166 : inskey.backward = false;
7420 tgl@sss.pgh.pa.us 1366 :CBC 2166 : break;
1367 : :
1368 : 644737 : case BTGreaterStrategyNumber:
1369 : :
1370 : : /*
1371 : : * Find first item > scankey
1372 : : */
128 pg@bowt.ie 1373 :GNC 644737 : inskey.nextkey = true;
1374 : 644737 : inskey.backward = false;
7420 tgl@sss.pgh.pa.us 1375 :CBC 644737 : break;
1376 : :
7420 tgl@sss.pgh.pa.us 1377 :UBC 0 : default:
1378 : : /* can't get here, but keep compiler quiet */
1379 [ # # ]: 0 : elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
1380 : : return false;
1381 : : }
1382 : :
1383 : : /*
1384 : : * Use the manufactured insertion scan key to descend the tree and
1385 : : * position ourselves on the target leaf page.
1386 : : */
128 pg@bowt.ie 1387 [ - + ]:GNC 6380594 : Assert(ScanDirectionIsBackward(dir) == inskey.backward);
219 tmunro@postgresql.or 1388 : 6380594 : stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
1389 : :
1390 : : /* don't need to keep the stack around... */
8668 tgl@sss.pgh.pa.us 1391 :CBC 6380593 : _bt_freestack(stack);
1392 : :
8424 bruce@momjian.us 1393 [ + + ]: 6380593 : if (!BufferIsValid(buf))
1394 : : {
1395 : : /*
1396 : : * We only get here if the index is completely empty. Lock the relation
1397 : : * because nothing finer to lock exists. Without a buffer lock, it's
1398 : : * possible for another transaction to insert data between
1399 : : * _bt_search() and PredicateLockRelation(). We have to try again
1400 : : * after taking the relation-level predicate lock, to close a narrow
1401 : : * window where we wouldn't scan concurrently inserted tuples, but the
1402 : : * writer wouldn't see our predicate lock.
1403 : : */
286 tmunro@postgresql.or 1404 [ + + ]: 241935 : if (IsolationIsSerializable())
1405 : : {
1406 : 2844 : PredicateLockRelation(rel, scan->xs_snapshot);
219 tmunro@postgresql.or 1407 :GNC 2844 : stack = _bt_search(rel, NULL, &inskey, &buf, BT_READ);
286 tmunro@postgresql.or 1408 :CBC 2844 : _bt_freestack(stack);
1409 : : }
1410 : :
1411 [ + - ]: 241935 : if (!BufferIsValid(buf))
1412 : : {
1413 : : /*
1414 : : * Mark parallel scan as done, so that all the workers can finish
1415 : : * their scan.
1416 : : */
1417 : 241935 : _bt_parallel_done(scan);
1418 : 241935 : BTScanPosInvalidate(so->currPos);
1419 : 241935 : return false;
1420 : : }
1421 : : }
1422 : :
1423 : 6138658 : PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
1424 : :
2615 rhaas@postgresql.org 1425 : 6138658 : _bt_initialize_more_data(so, dir);
1426 : :
1427 : : /* position to the precise item on the page */
1852 pg@bowt.ie 1428 : 6138658 : offnum = _bt_binsrch(rel, &inskey, buf);
3308 kgrittn@postgresql.o 1429 [ + - - + : 6138658 : Assert(!BTScanPosIsValid(so->currPos));
- + ]
1430 : 6138658 : so->currPos.buf = buf;
1431 : :
1432 : : /*
1433 : : * Now load data from the first page of the scan.
1434 : : *
1435 : : * If inskey.nextkey = false and inskey.backward = false, offnum is
1436 : : * positioned at the first non-pivot tuple >= inskey.scankeys.
1437 : : *
1438 : : * If inskey.nextkey = false and inskey.backward = true, offnum is
1439 : : * positioned at the last non-pivot tuple < inskey.scankeys.
1440 : : *
1441 : : * If inskey.nextkey = true and inskey.backward = false, offnum is
1442 : : * positioned at the first non-pivot tuple > inskey.scankeys.
1443 : : *
1444 : : * If inskey.nextkey = true and inskey.backward = true, offnum is
1445 : : * positioned at the last non-pivot tuple <= inskey.scankeys.
1446 : : *
1447 : : * It's possible that _bt_binsrch returned an offnum that is out of bounds
1448 : : * for the page. For example, when inskey is both < the leaf page's high
1449 : : * key and > all of its non-pivot tuples, offnum will be "maxoff + 1".
1450 : : */
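: : /*
: :  * Illustrative example (hypothetical page contents, mirroring the four
: :  * cases above): on a leaf page whose non-pivot tuples are (2, 4, 4, 7),
: :  * a scankey value of 4 leaves offnum at the first "4" when
: :  * nextkey=false/backward=false, at "2" when nextkey=false/backward=true,
: :  * at "7" when nextkey=true/backward=false, and at the last "4" when
: :  * nextkey=true/backward=true.
: :  */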
109 akorotkov@postgresql 1451 [ + + ]:GNC 6138658 : if (!_bt_readpage(scan, dir, offnum, true))
1452 : : {
1453 : : /*
1454 : : * There's no actually-matching data on this page. Try to advance to
1455 : : * the next page. Return false if there's no matching data at all.
1456 : : */
1363 pg@bowt.ie 1457 :CBC 1665691 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
6552 tgl@sss.pgh.pa.us 1458 [ + + ]: 1665691 : if (!_bt_steppage(scan, dir))
7420 1459 : 1664579 : return false;
1460 : : }
1461 : : else
1462 : : {
1463 : : /* We have at least one item to return as scan's first item */
3308 kgrittn@postgresql.o 1464 : 4472967 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
1465 : : }
1466 : :
2615 rhaas@postgresql.org 1467 : 4474203 : readcomplete:
1468 : : /* OK, itemIndex says what to return */
4571 tgl@sss.pgh.pa.us 1469 : 4474203 : currItem = &so->currPos.items[so->currPos.itemIndex];
1861 andres@anarazel.de 1470 : 4474203 : scan->xs_heaptid = currItem->heapTid;
4571 tgl@sss.pgh.pa.us 1471 [ + + ]: 4474203 : if (scan->xs_want_itup)
1472 : 65669 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1473 : :
6552 1474 : 4474203 : return true;
1475 : : }
1476 : :
1477 : : /*
1478 : : * _bt_next() -- Get the next item in a scan.
1479 : : *
1480 : : * On entry, so->currPos describes the current page, which may be pinned
1481 : : * but is not locked, and so->currPos.itemIndex identifies which item was
1482 : : * previously returned.
1483 : : *
1484 : : * On successful exit, scan->xs_heaptid is set to the TID of the next
1485 : : * heap tuple, and if requested, scan->xs_itup points to a copy of the
1486 : : * index tuple. so->currPos is updated as needed.
1487 : : *
1488 : : * On failure exit (no more tuples), we release pin and set
1489 : : * so->currPos.buf to InvalidBuffer.
1490 : : */
1491 : : bool
1492 : 8131620 : _bt_next(IndexScanDesc scan, ScanDirection dir)
1493 : : {
1494 : 8131620 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1495 : : BTScanPosItem *currItem;
1496 : :
1497 : : /*
1498 : : * Advance to next tuple on current page; or if there's no more, try to
1499 : : * step to the next page with data.
1500 : : */
1501 [ + + ]: 8131620 : if (ScanDirectionIsForward(dir))
1502 : : {
1503 [ + + ]: 8120021 : if (++so->currPos.itemIndex > so->currPos.lastItem)
1504 : : {
1505 [ + + ]: 1022529 : if (!_bt_steppage(scan, dir))
1506 : 1006805 : return false;
1507 : : }
1508 : : }
1509 : : else
1510 : : {
1511 [ + + ]: 11599 : if (--so->currPos.itemIndex < so->currPos.firstItem)
1512 : : {
1513 [ + + ]: 63 : if (!_bt_steppage(scan, dir))
8000 1514 : 40 : return false;
1515 : : }
1516 : : }
1517 : :
1518 : : /* OK, itemIndex says what to return */
4571 1519 : 7124775 : currItem = &so->currPos.items[so->currPos.itemIndex];
1861 andres@anarazel.de 1520 : 7124775 : scan->xs_heaptid = currItem->heapTid;
4571 tgl@sss.pgh.pa.us 1521 [ + + ]: 7124775 : if (scan->xs_want_itup)
1522 : 1949872 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
1523 : :
6552 1524 : 7124775 : return true;
1525 : : }
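: :
: : /*
: :  * Illustrative sketch of a caller (hypothetical code, not part of this
: :  * file): a forward scan for "first index column >= 42" driven through the
: :  * index AM layer.  index_getnext_tid() reaches btgettuple(), which calls
: :  * _bt_first() for the scan's first tuple and _bt_next() for each one after
: :  * that.  The extra headers would normally be included at the top of the
: :  * file; error handling and heap access are omitted.
: :  */
: : #ifdef NOT_USED
: : #include "access/genam.h"
: : #include "access/stratnum.h"
: : #include "storage/itemptr.h"
: : #include "utils/fmgroids.h"
: : #include "utils/snapmgr.h"
: :
: : static void
: : example_forward_scan(Relation heapRel, Relation indexRel)
: : {
: : 	ScanKeyData skey;
: : 	IndexScanDesc scan;
: : 	ItemPointer tid;
: :
: : 	/* qual: attribute 1 >= 42, using the int4 ">=" support function */
: : 	ScanKeyInit(&skey, 1, BTGreaterEqualStrategyNumber, F_INT4GE,
: : 				Int32GetDatum(42));
: :
: : 	scan = index_beginscan(heapRel, indexRel, GetActiveSnapshot(), 1, 0);
: : 	index_rescan(scan, &skey, 1, NULL, 0);
: :
: : 	/* each call yields the next matching heap TID, or NULL at the end */
: : 	while ((tid = index_getnext_tid(scan, ForwardScanDirection)) != NULL)
: : 	{
: : 		/* ... fetch the heap tuple identified by tid ... */
: : 	}
: :
: : 	index_endscan(scan);
: : }
: : #endif							/* NOT_USED */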
1526 : :
1527 : : /*
1528 : : * _bt_readpage() -- Load data from current index page into so->currPos
1529 : : *
1530 : : * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
1531 : : * is not changed here. Also, currPos.moreLeft and moreRight must be valid;
1532 : : * they are updated as appropriate. All other fields of so->currPos are
1533 : : * initialized from scratch here.
1534 : : *
1535 : : * We scan the current page starting at offnum and moving in the indicated
1536 : : * direction. All items matching the scan keys are loaded into currPos.items.
1537 : : * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
1538 : : * that there can be no more matching tuples in the current scan direction
1539 : : * (could just be for the current primitive index scan when scan has arrays).
1540 : : *
1541 : : * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
1542 : : * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
1543 : : * _bt_checkkeys will stop the scan as soon as an equality qual fails (when
1544 : : * its scan key was marked required), so _bt_first _must_ pass us an offnum
1545 : : * exactly at the beginning of where equal tuples are to be found. When we're
1546 : : * passed an offnum past the end of the page, we might still manage to stop
1547 : : * the scan on this page by calling _bt_checkkeys against the high key.
1548 : : *
1549 : : * In the case of a parallel scan, caller must have called _bt_parallel_seize
1550 : : * prior to calling this function; this function will invoke
1551 : : * _bt_parallel_release before returning.
1552 : : *
1553 : : * Returns true if any matching items found on the page, false if none.
1554 : : */
1555 : : static bool
109 akorotkov@postgresql 1556 :GNC 6192072 : _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
1557 : : bool firstPage)
1558 : : {
6552 tgl@sss.pgh.pa.us 1559 :CBC 6192072 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
1560 : : Page page;
1561 : : BTPageOpaque opaque;
1562 : : OffsetNumber minoff;
1563 : : OffsetNumber maxoff;
1564 : : BTReadPageState pstate;
1565 : : bool arrayKeys;
1566 : : int itemIndex,
1567 : : indnatts;
1568 : :
1569 : : /*
1570 : : * We must have the buffer pinned and locked, but the usual macro can't be
1571 : : * used here; this function is what makes it good for currPos.
1572 : : */
1573 [ - + ]: 6192072 : Assert(BufferIsValid(so->currPos.buf));
1574 : :
2916 kgrittn@postgresql.o 1575 : 6192072 : page = BufferGetPage(so->currPos.buf);
744 michael@paquier.xyz 1576 : 6192072 : opaque = BTPageGetOpaque(page);
1577 : :
1578 : : /* allow next page be processed by parallel worker */
2615 rhaas@postgresql.org 1579 [ + + ]: 6192072 : if (scan->parallel_scan)
1580 : : {
1581 [ + - ]: 665 : if (ScanDirectionIsForward(dir))
8 pg@bowt.ie 1582 :GNC 665 : pstate.prev_scan_page = opaque->btpo_next;
1583 : : else
8 pg@bowt.ie 1584 :UNC 0 : pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
1585 : :
8 pg@bowt.ie 1586 :GNC 665 : _bt_parallel_release(scan, pstate.prev_scan_page);
1587 : : }
1588 : :
1849 pg@bowt.ie 1589 :CBC 6192072 : indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
8 pg@bowt.ie 1590 :GNC 6192072 : arrayKeys = so->numArrayKeys != 0;
6552 tgl@sss.pgh.pa.us 1591 [ + + ]:CBC 6192072 : minoff = P_FIRSTDATAKEY(opaque);
1592 : 6192072 : maxoff = PageGetMaxOffsetNumber(page);
1593 : :
1594 : : /* initialize page-level state that we'll pass to _bt_checkkeys */
8 pg@bowt.ie 1595 :GNC 6192072 : pstate.dir = dir;
1596 : 6192072 : pstate.minoff = minoff;
1597 : 6192072 : pstate.maxoff = maxoff;
1598 : 6192072 : pstate.finaltup = NULL;
1599 : 6192072 : pstate.page = page;
1600 : 6192072 : pstate.offnum = InvalidOffsetNumber;
1601 : 6192072 : pstate.skip = InvalidOffsetNumber;
1602 : 6192072 : pstate.continuescan = true; /* default assumption */
1603 : 6192072 : pstate.prechecked = false;
1604 : 6192072 : pstate.firstmatch = false;
1605 : 6192072 : pstate.rechecks = 0;
1606 : 6192072 : pstate.targetdistance = 0;
1607 : :
1608 : : /*
1609 : : * We note the buffer's block number so that we can release the pin later.
1610 : : * This allows us to re-read the buffer if it is needed again for hinting.
1611 : : */
3308 kgrittn@postgresql.o 1612 :CBC 6192072 : so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
1613 : :
1614 : : /*
1615 : : * We save the LSN of the page as we read it, so that we know whether it
1616 : : * is safe to apply LP_DEAD hints to the page later. This allows us to drop
1617 : : * the pin for MVCC scans, which allows vacuum to avoid blocking.
1618 : : */
2287 alvherre@alvh.no-ip. 1619 : 6192072 : so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
1620 : :
1621 : : /*
1622 : : * we must save the page's right-link while scanning it; this tells us
1623 : : * where to step right to after we're done with these items. There is no
1624 : : * corresponding need for the left-link, since splits always go right.
1625 : : */
6552 tgl@sss.pgh.pa.us 1626 : 6192072 : so->currPos.nextPage = opaque->btpo_next;
1627 : :
1628 : : /* initialize tuple workspace to empty */
4571 1629 : 6192072 : so->currPos.nextTupleOffset = 0;
1630 : :
1631 : : /*
1632 : : * Now that the current page has been made consistent, the macro should be
1633 : : * good.
1634 : : */
3308 kgrittn@postgresql.o 1635 [ - + - - : 6192072 : Assert(BTScanPosIsPinned(so->currPos));
- + ]
1636 : :
1637 : : /*
1638 : : * Precheck the value of the continuescan flag for the last item on the
1639 : : * page (for a backwards scan it will be the first item on the page). If we
1640 : : * observe it to be true, then it should be true for all other items. This
1641 : : * allows us to do significant optimizations in the _bt_checkkeys()
1642 : : * function for all the items on the page.
1643 : : *
1644 : : * With the forward scan, we do this check for the last item on the page
1645 : : * instead of the high key. It's relatively likely that the most
1646 : : * significant column in the high key will be different from the
1647 : : * corresponding value from the last item on the page. So checking with
1648 : : * the last item on the page would give a more precise answer.
1649 : : *
1650 : : * We skip this for the first page read by each (primitive) scan, to avoid
1651 : : * slowing down point queries. They typically don't stand to gain much
1652 : : * when the optimization can be applied, and are more likely to notice the
1653 : : * overhead of the precheck.
1654 : : *
1655 : : * The optimization is unsafe and must be avoided whenever _bt_checkkeys
1656 : : * just set a low-order required array's key to the best available match
1657 : : * for a truncated -inf attribute value from the prior page's high key
1658 : : * (array element 0 is always the best available match in this scenario).
1659 : : * It's quite likely that matches for array element 0 begin on this page,
1660 : : * but the start of matches won't necessarily align with page boundaries.
1661 : : * When the start of matches is somewhere in the middle of this page, it
1662 : : * would be wrong to treat the page's final non-pivot tuple as representative.
1663 : : * Doing so might lead us to treat some of the page's earlier tuples as
1664 : : * being part of a group of tuples thought to satisfy the required keys.
1665 : : *
1666 : : * Note: Conversely, in the case where the scan's arrays just advanced
1667 : : * using the prior page's HIKEY _without_ advancement setting scanBehind,
1668 : : * the start of matches must be aligned with page boundaries, which makes
1669 : : * it safe to attempt the optimization here now. It's also safe when the
1670 : : * prior page's HIKEY simply didn't need to advance any required array. In
1671 : : * both cases we can safely assume that the _first_ tuple from this page
1672 : : * must be >= the current set of array keys/equality constraints. And so
1673 : : * if the final tuple is == those same keys (and also satisfies any
1674 : : * required < or <= strategy scan keys) during the precheck, we can safely
1675 : : * assume that this must also be true of all earlier tuples from the page.
1676 : : */
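: : /*
: :  * Illustrative example of the precheck (hypothetical qual): on a forward
: :  * scan with the required key "x <= 100", if the page's last non-pivot
: :  * tuple still satisfies x <= 100 (so continuescan stays true during the
: :  * precheck), then every earlier tuple on this ordered page satisfies it
: :  * as well, and _bt_checkkeys can avoid re-evaluating that key for them.
: :  */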
8 pg@bowt.ie 1677 [ + + + + :GNC 6192072 : if (!firstPage && !so->scanBehind && minoff < maxoff)
+ + ]
1678 : : {
1679 : : ItemId iid;
1680 : : IndexTuple itup;
1681 : :
191 akorotkov@postgresql 1682 [ + + ]: 17796 : iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
1683 : 17796 : itup = (IndexTuple) PageGetItem(page, iid);
1684 : :
1685 : : /* Call with arrayKeys=false to avoid undesirable side-effects */
8 pg@bowt.ie 1686 : 17796 : _bt_checkkeys(scan, &pstate, false, itup, indnatts);
1687 : 17796 : pstate.prechecked = pstate.continuescan;
1688 : 17796 : pstate.continuescan = true; /* reset */
1689 : : }
1690 : :
6552 tgl@sss.pgh.pa.us 1691 [ + + ]:CBC 6192072 : if (ScanDirectionIsForward(dir))
1692 : : {
1693 : : /* SK_SEARCHARRAY forward scans must provide high key up front */
8 pg@bowt.ie 1694 [ + + + + ]:GNC 6166995 : if (arrayKeys && !P_RIGHTMOST(opaque))
1695 : : {
1696 : 550 : ItemId iid = PageGetItemId(page, P_HIKEY);
1697 : :
1698 : 550 : pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1699 : : }
1700 : :
1701 : : /* load items[] in ascending order */
6552 tgl@sss.pgh.pa.us 1702 :CBC 6166995 : itemIndex = 0;
1703 : :
1704 : 6166995 : offnum = Max(offnum, minoff);
1705 : :
1706 [ + + ]: 24828751 : while (offnum <= maxoff)
1707 : : {
1849 pg@bowt.ie 1708 : 23487554 : ItemId iid = PageGetItemId(page, offnum);
1709 : : IndexTuple itup;
1710 : : bool passes_quals;
1711 : :
1712 : : /*
1713 : : * If the scan specifies not to return killed tuples, then we
1714 : : * treat a killed tuple as not passing the qual
1715 : : */
1716 [ + + + + ]: 23487554 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1717 : : {
1718 : 1135641 : offnum = OffsetNumberNext(offnum);
1719 : 1135641 : continue;
1720 : : }
1721 : :
1722 : 22351913 : itup = (IndexTuple) PageGetItem(page, iid);
128 pg@bowt.ie 1723 [ - + ]:GNC 22351913 : Assert(!BTreeTupleIsPivot(itup));
1724 : :
8 1725 : 22351913 : pstate.offnum = offnum;
1726 : 22351913 : passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1727 : : itup, indnatts);
1728 : :
1729 : : /*
1730 : : * Check if we need to skip ahead to a later tuple (only possible
1731 : : * when the scan uses array keys)
1732 : : */
1733 [ + + + + : 22351913 : if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+ - + + ]
1734 : : {
1735 [ + - - + ]: 275 : Assert(!passes_quals && pstate.continuescan);
1736 [ - + ]: 275 : Assert(offnum < pstate.skip);
1737 : :
1738 : 275 : offnum = pstate.skip;
1739 : 275 : pstate.skip = InvalidOffsetNumber;
1740 : 275 : continue;
1741 : : }
1742 : :
191 akorotkov@postgresql 1743 [ + + ]: 22351638 : if (passes_quals)
1744 : : {
1745 : : /* tuple passes all scan key conditions */
8 pg@bowt.ie 1746 : 17327165 : pstate.firstmatch = true;
1509 pg@bowt.ie 1747 [ + + ]:CBC 17327165 : if (!BTreeTupleIsPosting(itup))
1748 : : {
1749 : : /* Remember it */
1750 : 17047537 : _bt_saveitem(so, itemIndex, offnum, itup);
1751 : 17047537 : itemIndex++;
1752 : : }
1753 : : else
1754 : : {
1755 : : int tupleOffset;
1756 : :
1757 : : /*
1758 : : * Set up state to return posting list, and remember first
1759 : : * TID
1760 : : */
1761 : : tupleOffset =
1762 : 279628 : _bt_setuppostingitems(so, itemIndex, offnum,
1763 : : BTreeTupleGetPostingN(itup, 0),
1764 : : itup);
1765 : 279628 : itemIndex++;
1766 : : /* Remember additional TIDs */
1767 [ + + ]: 1474790 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1768 : : {
1769 : 1195162 : _bt_savepostingitem(so, itemIndex, offnum,
1770 : : BTreeTupleGetPostingN(itup, i),
1771 : : tupleOffset);
1772 : 1195162 : itemIndex++;
1773 : : }
1774 : : }
1775 : : }
1776 : : /* When !continuescan, there can't be any more matches, so stop */
8 pg@bowt.ie 1777 [ + + ]:GNC 22351638 : if (!pstate.continuescan)
6552 tgl@sss.pgh.pa.us 1778 :CBC 4825798 : break;
1779 : :
1780 : 17525840 : offnum = OffsetNumberNext(offnum);
1781 : : }
1782 : :
1783 : : /*
1784 : : * We don't need to visit the page to the right when the high key
1785 : : * indicates that no more matches will be found there.
1786 : : *
1787 : : * Checking the high key like this works out more often than you might
1788 : : * think. Leaf page splits pick a split point between the two most
1789 : : * dissimilar tuples (this is weighed against the need to evenly share
1790 : : * free space). Leaf pages with high key attribute values that can
1791 : : * only appear on non-pivot tuples on the right sibling page are
1792 : : * common.
1793 : : */
8 pg@bowt.ie 1794 [ + + + + ]:GNC 6166995 : if (pstate.continuescan && !P_RIGHTMOST(opaque))
1795 : : {
1849 pg@bowt.ie 1796 :CBC 61738 : ItemId iid = PageGetItemId(page, P_HIKEY);
1797 : 61738 : IndexTuple itup = (IndexTuple) PageGetItem(page, iid);
1798 : : int truncatt;
1799 : :
1800 [ + - ]: 61738 : truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
8 pg@bowt.ie 1801 :GNC 61738 : pstate.prechecked = false; /* precheck didn't cover HIKEY */
1802 : 61738 : _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
1803 : : }
1804 : :
1805 [ + + ]: 6166995 : if (!pstate.continuescan)
1849 pg@bowt.ie 1806 :CBC 4856744 : so->currPos.moreRight = false;
1807 : :
1509 1808 [ - + ]: 6166995 : Assert(itemIndex <= MaxTIDsPerBTreePage);
6552 tgl@sss.pgh.pa.us 1809 : 6166995 : so->currPos.firstItem = 0;
1810 : 6166995 : so->currPos.lastItem = itemIndex - 1;
1811 : 6166995 : so->currPos.itemIndex = 0;
1812 : : }
1813 : : else
1814 : : {
1815 : : /* SK_SEARCHARRAY backward scans must provide final tuple up front */
8 pg@bowt.ie 1816 [ + + + - :GNC 25077 : if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
+ + ]
1817 : : {
1818 : 12 : ItemId iid = PageGetItemId(page, minoff);
1819 : :
1820 : 12 : pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
1821 : : }
1822 : :
1823 : : /* load items[] in descending order */
1509 pg@bowt.ie 1824 :CBC 25077 : itemIndex = MaxTIDsPerBTreePage;
1825 : :
6552 tgl@sss.pgh.pa.us 1826 : 25077 : offnum = Min(offnum, maxoff);
1827 : :
1828 [ + + ]: 4202600 : while (offnum >= minoff)
1829 : : {
1849 pg@bowt.ie 1830 : 4177571 : ItemId iid = PageGetItemId(page, offnum);
1831 : : IndexTuple itup;
1832 : : bool tuple_alive;
1833 : : bool passes_quals;
1834 : :
1835 : : /*
1836 : : * If the scan specifies not to return killed tuples, then we
1837 : : * treat a killed tuple as not passing the qual. Most of the
1838 : : * time, it's a win to not bother examining the tuple's index
1839 : : * keys, but just skip to the next tuple (previous, actually,
1840 : : * since we're scanning backwards). However, if this is the first
1841 : : * tuple on the page, we do check the index keys, to prevent
1842 : : * uselessly advancing to the page to the left. This is similar
1843 : : * to the high key optimization used by forward scans.
1844 : : */
1845 [ + + + + ]: 4177571 : if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
1846 : : {
1847 [ + + - + ]: 521309 : Assert(offnum >= P_FIRSTDATAKEY(opaque));
1848 [ + + + + ]: 521309 : if (offnum > P_FIRSTDATAKEY(opaque))
1849 : : {
1850 : 519929 : offnum = OffsetNumberPrev(offnum);
1851 : 519929 : continue;
1852 : : }
1853 : :
1854 : 1380 : tuple_alive = false;
1855 : : }
1856 : : else
1857 : 3656262 : tuple_alive = true;
1858 : :
1859 : 3657642 : itup = (IndexTuple) PageGetItem(page, iid);
128 pg@bowt.ie 1860 [ - + ]:GNC 3657642 : Assert(!BTreeTupleIsPivot(itup));
1861 : :
8 1862 : 3657642 : pstate.offnum = offnum;
1863 : 3657642 : passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
1864 : : itup, indnatts);
1865 : :
1866 : : /*
1867 : : * Check if we need to skip ahead to a later tuple (only possible
1868 : : * when the scan uses array keys)
1869 : : */
1870 [ + + + + : 3657642 : if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+ - + + ]
1871 : : {
1872 [ + - - + ]: 3 : Assert(!passes_quals && pstate.continuescan);
1873 [ - + ]: 3 : Assert(offnum > pstate.skip);
1874 : :
1875 : 3 : offnum = pstate.skip;
1876 : 3 : pstate.skip = InvalidOffsetNumber;
1877 : 3 : continue;
1878 : : }
1879 : :
1849 pg@bowt.ie 1880 [ + + + + ]:CBC 3657639 : if (passes_quals && tuple_alive)
1881 : : {
1882 : : /* tuple passes all scan key conditions */
8 pg@bowt.ie 1883 :GNC 3656148 : pstate.firstmatch = true;
1509 pg@bowt.ie 1884 [ + + ]:CBC 3656148 : if (!BTreeTupleIsPosting(itup))
1885 : : {
1886 : : /* Remember it */
1887 : 3620722 : itemIndex--;
1888 : 3620722 : _bt_saveitem(so, itemIndex, offnum, itup);
1889 : : }
1890 : : else
1891 : : {
1892 : : int tupleOffset;
1893 : :
1894 : : /*
1895 : : * Set up state to return posting list, and remember first
1896 : : * TID.
1897 : : *
1898 : : * Note that we deliberately save/return items from
1899 : : * posting lists in ascending heap TID order for backwards
1900 : : * scans. This allows _bt_killitems() to make a
1901 : : * consistent assumption about the order of items
1902 : : * associated with the same posting list tuple.
1903 : : */
1904 : 35426 : itemIndex--;
1905 : : tupleOffset =
1906 : 35426 : _bt_setuppostingitems(so, itemIndex, offnum,
1907 : : BTreeTupleGetPostingN(itup, 0),
1908 : : itup);
1909 : : /* Remember additional TIDs */
1910 [ + + ]: 126331 : for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1911 : : {
1912 : 90905 : itemIndex--;
1913 : 90905 : _bt_savepostingitem(so, itemIndex, offnum,
1914 : : BTreeTupleGetPostingN(itup, i),
1915 : : tupleOffset);
1916 : : }
1917 : : }
1918 : : }
8 pg@bowt.ie 1919 [ + + ]:GNC 3657639 : if (!pstate.continuescan)
1920 : : {
1921 : : /* there can't be any more matches, so stop */
6552 tgl@sss.pgh.pa.us 1922 :CBC 48 : so->currPos.moreLeft = false;
1923 : 48 : break;
1924 : : }
1925 : :
1926 : 3657591 : offnum = OffsetNumberPrev(offnum);
1927 : : }
1928 : :
1929 [ - + ]: 25077 : Assert(itemIndex >= 0);
1930 : 25077 : so->currPos.firstItem = itemIndex;
1509 pg@bowt.ie 1931 : 25077 : so->currPos.lastItem = MaxTIDsPerBTreePage - 1;
1932 : 25077 : so->currPos.itemIndex = MaxTIDsPerBTreePage - 1;
1933 : : }
1934 : :
6552 tgl@sss.pgh.pa.us 1935 : 6192072 : return (so->currPos.firstItem <= so->currPos.lastItem);
1936 : : }
1937 : :
1938 : : /* Save an index item into so->currPos.items[itemIndex] */
1939 : : static void
4571 1940 : 20668259 : _bt_saveitem(BTScanOpaque so, int itemIndex,
1941 : : OffsetNumber offnum, IndexTuple itup)
1942 : : {
1943 : 20668259 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1944 : :
1509 pg@bowt.ie 1945 [ + - - + ]: 20668259 : Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup));
1946 : :
4571 tgl@sss.pgh.pa.us 1947 : 20668259 : currItem->heapTid = itup->t_tid;
1948 : 20668259 : currItem->indexOffset = offnum;
1949 [ + + ]: 20668259 : if (so->currTuples)
1950 : : {
1951 : 10776942 : Size itupsz = IndexTupleSize(itup);
1952 : :
1953 : 10776942 : currItem->tupleOffset = so->currPos.nextTupleOffset;
1954 : 10776942 : memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
1955 : 10776942 : so->currPos.nextTupleOffset += MAXALIGN(itupsz);
1956 : : }
1957 : 20668259 : }
1958 : :
1959 : : /*
1960 : : * Setup state to save TIDs/items from a single posting list tuple.
1961 : : *
1962 : : * Saves an index item into so->currPos.items[itemIndex] for the TID that
1963 : : * the scan returns first. Second and subsequent TIDs from the posting list
1964 : : * should be saved by calling _bt_savepostingitem().
1965 : : *
1966 : : * Returns the offset into tuple storage space at which the main tuple is
1967 : : * stored, if needed.
1968 : : */
1969 : : static int
1509 pg@bowt.ie 1970 : 315054 : _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
1971 : : ItemPointer heapTid, IndexTuple itup)
1972 : : {
1973 : 315054 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
1974 : :
1975 [ - + ]: 315054 : Assert(BTreeTupleIsPosting(itup));
1976 : :
1977 : 315054 : currItem->heapTid = *heapTid;
1978 : 315054 : currItem->indexOffset = offnum;
1979 [ + + ]: 315054 : if (so->currTuples)
1980 : : {
1981 : : /* Save base IndexTuple (truncate posting list) */
1982 : : IndexTuple base;
1983 : 166462 : Size itupsz = BTreeTupleGetPostingOffset(itup);
1984 : :
1985 : 166462 : itupsz = MAXALIGN(itupsz);
1986 : 166462 : currItem->tupleOffset = so->currPos.nextTupleOffset;
1987 : 166462 : base = (IndexTuple) (so->currTuples + so->currPos.nextTupleOffset);
1988 : 166462 : memcpy(base, itup, itupsz);
1989 : : /* Defensively reduce work area index tuple header size */
1990 : 166462 : base->t_info &= ~INDEX_SIZE_MASK;
1991 : 166462 : base->t_info |= itupsz;
1992 : 166462 : so->currPos.nextTupleOffset += itupsz;
1993 : :
1994 : 166462 : return currItem->tupleOffset;
1995 : : }
1996 : :
1997 : 148592 : return 0;
1998 : : }
1999 : :
2000 : : /*
2001 : : * Save an index item into so->currPos.items[itemIndex] for the current posting
2002 : : * tuple.
2003 : : *
2004 : : * Assumes that _bt_setuppostingitems() has already been called for the current
2005 : : * posting list tuple. Caller passes its return value as tupleOffset.
2006 : : */
2007 : : static inline void
2008 : 1286067 : _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
2009 : : ItemPointer heapTid, int tupleOffset)
2010 : : {
2011 : 1286067 : BTScanPosItem *currItem = &so->currPos.items[itemIndex];
2012 : :
2013 : 1286067 : currItem->heapTid = *heapTid;
2014 : 1286067 : currItem->indexOffset = offnum;
2015 : :
2016 : : /*
2017 : : * Have index-only scans return the same base IndexTuple for every TID
2018 : : * that originates from the same posting list
2019 : : */
2020 [ + + ]: 1286067 : if (so->currTuples)
2021 : 570377 : currItem->tupleOffset = tupleOffset;
2022 : 1286067 : }
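: :
: : /*
: :  * Illustrative example: a posting list tuple carrying three heap TIDs yields
: :  * three consecutive items[] entries -- the first set up by
: :  * _bt_setuppostingitems(), the other two by _bt_savepostingitem() -- all
: :  * sharing one tupleOffset, so an index-only scan returns the same base
: :  * IndexTuple for each of those TIDs.
: :  */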
2023 : :
2024 : : /*
2025 : : * _bt_steppage() -- Step to next page containing valid data for scan
2026 : : *
2027 : : * On entry, if so->currPos.buf is valid the buffer is pinned but not locked;
2028 : : * if pinned, we'll drop the pin before moving to next page. The buffer is
2029 : : * not locked on exit.
2030 : : *
2031 : : * For success on a scan using a non-MVCC snapshot we hold a pin, but not a
2032 : : * read lock, on that page. If we do not hold the pin, we set so->currPos.buf
2033 : : * to InvalidBuffer. We return true to indicate success.
2034 : : */
2035 : : static bool
6552 tgl@sss.pgh.pa.us 2036 : 2689117 : _bt_steppage(IndexScanDesc scan, ScanDirection dir)
2037 : : {
8668 2038 : 2689117 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2615 rhaas@postgresql.org 2039 : 2689117 : BlockNumber blkno = InvalidBlockNumber;
2040 : : bool status;
2041 : :
3308 kgrittn@postgresql.o 2042 [ - + - - : 2689117 : Assert(BTScanPosIsValid(so->currPos));
- + ]
2043 : :
2044 : : /* Before leaving current page, deal with any killed items */
6552 tgl@sss.pgh.pa.us 2045 [ + + ]: 2689117 : if (so->numKilled > 0)
3308 kgrittn@postgresql.o 2046 : 35521 : _bt_killitems(scan);
2047 : :
2048 : : /*
2049 : : * Before we modify currPos, make a copy of the page data if there was a
2050 : : * mark position that needs it.
2051 : : */
6443 tgl@sss.pgh.pa.us 2052 [ + + ]: 2689117 : if (so->markItemIndex >= 0)
2053 : : {
2054 : : /* bump pin on current buffer for assignment to mark buffer */
3308 kgrittn@postgresql.o 2055 [ - + - - : 183 : if (BTScanPosIsPinned(so->currPos))
+ + ]
2056 : 175 : IncrBufferRefCount(so->currPos.buf);
6443 tgl@sss.pgh.pa.us 2057 : 183 : memcpy(&so->markPos, &so->currPos,
2058 : : offsetof(BTScanPosData, items[1]) +
2059 : 183 : so->currPos.lastItem * sizeof(BTScanPosItem));
4571 2060 [ + + ]: 183 : if (so->markTuples)
2061 : 174 : memcpy(so->markTuples, so->currTuples,
2062 : 174 : so->currPos.nextTupleOffset);
6443 2063 : 183 : so->markPos.itemIndex = so->markItemIndex;
2064 : 183 : so->markItemIndex = -1;
2065 : :
2066 : : /*
2067 : : * If we're just about to start the next primitive index scan
2069 : : * (possible with a scan that has array keys, and needs to skip to
2069 : : * continue in the current scan direction), moreLeft/moreRight only
2070 : : * indicate the end of the current primitive index scan. They must
2071 : : * never be taken to indicate that the top-level index scan has ended
2072 : : * (that would be wrong).
2073 : : *
2074 : : * We could handle this case by treating the current array keys as
2075 : : * markPos state. But depending on the current array state like this
2076 : : * would add complexity. Instead, we just unset markPos's copy of
2077 : : * moreRight or moreLeft (whichever might be affected), while making
2078 : : * btrestpos reset the scan's arrays to their initial scan positions.
2079 : : * In effect, btrestpos leaves advancing the arrays up to the first
2080 : : * _bt_readpage call (that takes place after it has restored markPos).
2081 : : */
8 pg@bowt.ie 2082 [ - + ]:GNC 183 : Assert(so->markPos.dir == dir);
2083 [ - + ]: 183 : if (so->needPrimScan)
2084 : : {
8 pg@bowt.ie 2085 [ # # ]:UNC 0 : if (ScanDirectionIsForward(dir))
2086 : 0 : so->markPos.moreRight = true;
2087 : : else
2088 : 0 : so->markPos.moreLeft = true;
2089 : : }
2090 : : }
2091 : :
9716 bruce@momjian.us 2092 [ + + ]:CBC 2689117 : if (ScanDirectionIsForward(dir))
2093 : : {
2094 : : /* Walk right to the next page with data */
2615 rhaas@postgresql.org 2095 [ + + ]: 2687948 : if (scan->parallel_scan != NULL)
2096 : : {
2097 : : /*
2098 : : * Seize the scan to get the next block number; if the scan has
2099 : : * ended already, bail out.
2100 : : */
8 pg@bowt.ie 2101 :GNC 665 : status = _bt_parallel_seize(scan, &blkno, false);
2615 rhaas@postgresql.org 2102 [ + + ]:CBC 665 : if (!status)
2103 : : {
2104 : : /* release the previous buffer, if pinned */
2105 [ - + - - : 147 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
2106 : 147 : BTScanPosInvalidate(so->currPos);
2107 : 147 : return false;
2108 : : }
2109 : : }
2110 : : else
2111 : : {
2112 : : /* Not parallel, so use the previously-saved nextPage link. */
2113 : 2687283 : blkno = so->currPos.nextPage;
2114 : : }
2115 : :
2116 : : /* Remember we left a page with data */
6552 tgl@sss.pgh.pa.us 2117 : 2687801 : so->currPos.moreLeft = true;
2118 : :
2119 : : /* release the previous buffer, if pinned */
3308 kgrittn@postgresql.o 2120 [ - + - - : 2687801 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
2121 : : }
2122 : : else
2123 : : {
2124 : : /* Remember we left a page with data */
2615 rhaas@postgresql.org 2125 : 1169 : so->currPos.moreRight = true;
2126 : :
2127 [ - + ]: 1169 : if (scan->parallel_scan != NULL)
2128 : : {
2129 : : /*
2130 : : * Seize the scan to get the current block number; if the scan has
2131 : : * ended already, bail out.
2132 : : */
8 pg@bowt.ie 2133 :UNC 0 : status = _bt_parallel_seize(scan, &blkno, false);
2615 rhaas@postgresql.org 2134 [ # # # # :UBC 0 : BTScanPosUnpinIfPinned(so->currPos);
# # ]
2135 [ # # ]: 0 : if (!status)
2136 : : {
2137 : 0 : BTScanPosInvalidate(so->currPos);
2138 : 0 : return false;
2139 : : }
2140 : : }
2141 : : else
2142 : : {
2143 : : /* Not parallel, so just use our own notion of the current page */
2615 rhaas@postgresql.org 2144 :CBC 1169 : blkno = so->currPos.currPage;
2145 : : }
2146 : : }
2147 : :
2148 [ + + ]: 2688970 : if (!_bt_readnextpage(scan, blkno, dir))
2149 : 2672069 : return false;
2150 : :
2151 : : /* We have at least one item to return as scan's next item */
2152 : 16901 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2153 : :
2154 : 16901 : return true;
2155 : : }
2156 : :
2157 : : /*
2158 : : * _bt_readnextpage() -- Read next page containing valid data for scan
2159 : : *
2160 : : * On success exit, so->currPos is updated to contain data from the next
2161 : : * interesting page, and we return true. Caller must release the lock (and
2162 : : * maybe the pin) on the buffer on success exit.
2163 : : *
2164 : : * If there are no more matching records in the given direction, we drop all
2165 : : * locks and pins, set so->currPos.buf to InvalidBuffer, and return false.
2166 : : */
2167 : : static bool
2168 : 2689094 : _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2169 : : {
2170 : 2689094 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2171 : : Relation rel;
2172 : : Page page;
2173 : : BTPageOpaque opaque;
2174 : : bool status;
2175 : :
2176 : 2689094 : rel = scan->indexRelation;
2177 : :
2178 [ + + ]: 2689094 : if (ScanDirectionIsForward(dir))
2179 : : {
2180 : : for (;;)
2181 : : {
2182 : : /*
2183 : : * if we're at end of scan, give up and mark parallel scan as
2184 : : * done, so that all the workers can finish their scan
2185 : : */
6552 tgl@sss.pgh.pa.us 2186 [ + + + + ]: 2688709 : if (blkno == P_NONE || !so->currPos.moreRight)
2187 : : {
2615 rhaas@postgresql.org 2188 : 2672026 : _bt_parallel_done(scan);
3308 kgrittn@postgresql.o 2189 : 2672026 : BTScanPosInvalidate(so->currPos);
6552 tgl@sss.pgh.pa.us 2190 : 2672026 : return false;
2191 : : }
2192 : : /* check for interrupts while we're not holding any buffer lock */
5458 2193 [ - + ]: 16683 : CHECK_FOR_INTERRUPTS();
2194 : : /* step right one page */
309 pg@bowt.ie 2195 : 16683 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2916 kgrittn@postgresql.o 2196 : 16683 : page = BufferGetPage(so->currPos.buf);
744 michael@paquier.xyz 2197 : 16683 : opaque = BTPageGetOpaque(page);
2198 : : /* check for deleted page */
6552 tgl@sss.pgh.pa.us 2199 [ + - ]: 16683 : if (!P_IGNORE(opaque))
2200 : : {
4687 heikki.linnakangas@i 2201 : 16683 : PredicateLockPage(rel, blkno, scan->xs_snapshot);
2202 : : /* see if there are any matches on this page */
2203 : : /* note that this will clear moreRight if we can stop */
109 akorotkov@postgresql 2204 [ + + + + ]:GNC 16683 : if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque), false))
6552 tgl@sss.pgh.pa.us 2205 :CBC 15899 : break;
2206 : : }
2314 rhaas@postgresql.org 2207 [ # # ]:UBC 0 : else if (scan->parallel_scan != NULL)
2208 : : {
2209 : : /* allow next page to be processed by parallel worker */
2210 : 0 : _bt_parallel_release(scan, opaque->btpo_next);
2211 : : }
2212 : :
2213 : : /* nope, keep going */
2615 rhaas@postgresql.org 2214 [ - + ]:CBC 784 : if (scan->parallel_scan != NULL)
2215 : : {
2088 akapila@postgresql.o 2216 :UBC 0 : _bt_relbuf(rel, so->currPos.buf);
8 pg@bowt.ie 2217 :UNC 0 : status = _bt_parallel_seize(scan, &blkno, false);
2615 rhaas@postgresql.org 2218 [ # # ]:UBC 0 : if (!status)
2219 : : {
2220 : 0 : BTScanPosInvalidate(so->currPos);
2221 : 0 : return false;
2222 : : }
2223 : : }
2224 : : else
2225 : : {
2615 rhaas@postgresql.org 2226 :CBC 784 : blkno = opaque->btpo_next;
2088 akapila@postgresql.o 2227 : 784 : _bt_relbuf(rel, so->currPos.buf);
2228 : : }
2229 : : }
2230 : : }
2231 : : else
2232 : : {
2233 : : /*
2234 : : * Should only happen in parallel cases, when some other backend
2235 : : * advanced the scan.
2236 : : */
2615 rhaas@postgresql.org 2237 [ - + ]: 1169 : if (so->currPos.currPage != blkno)
2238 : : {
2615 rhaas@postgresql.org 2239 [ # # # # :UBC 0 : BTScanPosUnpinIfPinned(so->currPos);
# # ]
2240 : 0 : so->currPos.currPage = blkno;
2241 : : }
2242 : :
2243 : : /*
2244 : : * Walk left to the next page with data. This is much more complex
2245 : : * than the walk-right case because of the possibility that the page
2246 : : * to our left splits while we are in flight to it, plus the
2247 : : * possibility that the page we were on gets deleted after we leave
2248 : : * it. See nbtree/README for details.
2249 : : *
2250 : : * It might be possible to rearrange this code to have less overhead
2251 : : * in pinning and locking, but that would require capturing the left
2252 : : * sibling block number when the page is initially read, and then
2253 : : * optimistically starting there (rather than pinning the page twice).
2254 : : * It is not clear that this would be worth the complexity.
2255 : : */
3308 kgrittn@postgresql.o 2256 [ - + - - :CBC 1169 : if (BTScanPosIsPinned(so->currPos))
+ + ]
1363 pg@bowt.ie 2257 : 1147 : _bt_lockbuf(rel, so->currPos.buf, BT_READ);
2258 : : else
309 2259 : 22 : so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
2260 : :
2261 : : for (;;)
2262 : : {
2263 : : /* Done if we know there are no matching keys to the left */
6552 tgl@sss.pgh.pa.us 2264 [ + + ]: 1175 : if (!so->currPos.moreLeft)
2265 : : {
2266 : 27 : _bt_relbuf(rel, so->currPos.buf);
2615 rhaas@postgresql.org 2267 : 27 : _bt_parallel_done(scan);
3308 kgrittn@postgresql.o 2268 : 27 : BTScanPosInvalidate(so->currPos);
6552 tgl@sss.pgh.pa.us 2269 : 27 : return false;
2270 : : }
2271 : :
2272 : : /* Step to next physical page */
219 tmunro@postgresql.or 2273 :GNC 1148 : so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
2274 : :
2275 : : /* if we're physically at end of index, return failure */
6552 tgl@sss.pgh.pa.us 2276 [ + + ]:CBC 1148 : if (so->currPos.buf == InvalidBuffer)
2277 : : {
2615 rhaas@postgresql.org 2278 : 16 : _bt_parallel_done(scan);
3308 kgrittn@postgresql.o 2279 : 16 : BTScanPosInvalidate(so->currPos);
6552 tgl@sss.pgh.pa.us 2280 : 16 : return false;
2281 : : }
2282 : :
2283 : : /*
2284 : : * Okay, we managed to move left to a non-deleted page. Done if
2285 : : * it's not half-dead and contains matching tuples. Else loop back
2286 : : * and do it all again.
2287 : : */
2916 kgrittn@postgresql.o 2288 : 1132 : page = BufferGetPage(so->currPos.buf);
744 michael@paquier.xyz 2289 : 1132 : opaque = BTPageGetOpaque(page);
6552 tgl@sss.pgh.pa.us 2290 [ + - ]: 1132 : if (!P_IGNORE(opaque))
2291 : : {
4687 heikki.linnakangas@i 2292 : 1132 : PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
2293 : : /* see if there are any matches on this page */
2294 : : /* note that this will clear moreLeft if we can stop */
109 akorotkov@postgresql 2295 [ + + ]:GNC 1132 : if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false))
6552 tgl@sss.pgh.pa.us 2296 :CBC 1126 : break;
2297 : : }
2314 rhaas@postgresql.org 2298 [ # # ]:UBC 0 : else if (scan->parallel_scan != NULL)
2299 : : {
2300 : : /* allow next page to be processed by parallel worker */
2301 : 0 : _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
2302 : : }
2303 : :
2304 : : /*
2305 : : * For parallel scans, get the last page scanned as it is quite
2306 : : * possible that by the time we try to seize the scan, some other
2307 : : * worker has already advanced the scan to a different page. We
2308 : : * must continue based on the latest page scanned by any worker.
2309 : : */
2615 rhaas@postgresql.org 2310 [ - + ]:CBC 6 : if (scan->parallel_scan != NULL)
2311 : : {
2615 rhaas@postgresql.org 2312 :UBC 0 : _bt_relbuf(rel, so->currPos.buf);
8 pg@bowt.ie 2313 :UNC 0 : status = _bt_parallel_seize(scan, &blkno, false);
2615 rhaas@postgresql.org 2314 [ # # ]:UBC 0 : if (!status)
2315 : : {
2316 : 0 : BTScanPosInvalidate(so->currPos);
2317 : 0 : return false;
2318 : : }
309 pg@bowt.ie 2319 : 0 : so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
2320 : : }
2321 : : }
2322 : : }
2323 : :
2615 rhaas@postgresql.org 2324 :CBC 17025 : return true;
2325 : : }
2326 : :
2327 : : /*
2328 : : * _bt_parallel_readpage() -- Read current page containing valid data for scan
2329 : : *
2330 : : * On success, release the lock and maybe the pin on the buffer. We return true to
2331 : : * indicate success.
2332 : : */
2333 : : static bool
2334 : 124 : _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
2335 : : {
2336 : 124 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2337 : :
8 pg@bowt.ie 2338 [ - + ]:GNC 124 : Assert(!so->needPrimScan);
2339 : :
2615 rhaas@postgresql.org 2340 :CBC 124 : _bt_initialize_more_data(so, dir);
2341 : :
2342 [ - + ]: 124 : if (!_bt_readnextpage(scan, blkno, dir))
2615 rhaas@postgresql.org 2343 :UBC 0 : return false;
2344 : :
2345 : : /* We have at least one item to return as scan's next item */
3308 kgrittn@postgresql.o 2346 :CBC 124 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2347 : :
9357 bruce@momjian.us 2348 : 124 : return true;
2349 : : }
2350 : :
2351 : : /*
2352 : : * _bt_walk_left() -- step left one page, if possible
2353 : : *
2354 : : * The given buffer must be pinned and read-locked. Both will be dropped
2355 : : * before stepping left. On return, we have pin and read lock on the
2356 : : * returned page, instead.
2357 : : *
2358 : : * Returns InvalidBuffer if there is no page to the left (no lock is held
2359 : : * in that case).
2360 : : *
2361 : : * It is possible for the returned leaf page to be half-dead; caller must
2362 : : * check that condition and step left again when required.
2363 : : */
2364 : : static Buffer
219 tmunro@postgresql.or 2365 :GNC 1148 : _bt_walk_left(Relation rel, Buffer buf)
2366 : : {
2367 : : Page page;
2368 : : BTPageOpaque opaque;
2369 : :
2916 kgrittn@postgresql.o 2370 :CBC 1148 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2371 : 1148 : opaque = BTPageGetOpaque(page);
2372 : :
2373 : : for (;;)
7722 tgl@sss.pgh.pa.us 2374 :UBC 0 : {
2375 : : BlockNumber obknum;
2376 : : BlockNumber lblkno;
2377 : : BlockNumber blkno;
2378 : : int tries;
2379 : :
2380 : : /* if we're at end of tree, release buf and return failure */
7722 tgl@sss.pgh.pa.us 2381 [ + + ]:CBC 1148 : if (P_LEFTMOST(opaque))
2382 : : {
2383 : 16 : _bt_relbuf(rel, buf);
2384 : 16 : break;
2385 : : }
2386 : : /* remember original page we are stepping left from */
2387 : 1132 : obknum = BufferGetBlockNumber(buf);
2388 : : /* step left */
2389 : 1132 : blkno = lblkno = opaque->btpo_prev;
5458 2390 : 1132 : _bt_relbuf(rel, buf);
2391 : : /* check for interrupts while we're not holding any buffer lock */
2392 [ - + ]: 1132 : CHECK_FOR_INTERRUPTS();
309 pg@bowt.ie 2393 : 1132 : buf = _bt_getbuf(rel, blkno, BT_READ);
2916 kgrittn@postgresql.o 2394 : 1132 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2395 : 1132 : opaque = BTPageGetOpaque(page);
2396 : :
2397 : : /*
2398 : : * If this isn't the page we want, walk right till we find what we
2399 : : * want --- but go no more than four hops (an arbitrary limit). If we
2400 : : * don't find the correct page by then, the most likely bet is that
2401 : : * the original page got deleted and isn't in the sibling chain at all
2402 : : * anymore, not that its left sibling got split more than four times.
2403 : : *
2404 : : * Note that it is correct to test P_ISDELETED not P_IGNORE here,
2405 : : * because half-dead pages are still in the sibling chain.
2406 : : */
7722 tgl@sss.pgh.pa.us 2407 : 1132 : tries = 0;
2408 : : for (;;)
2409 : : {
2410 [ + - + - ]: 1132 : if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
2411 : : {
2412 : : /* Found desired page, return it */
2413 : 1132 : return buf;
2414 : : }
7722 tgl@sss.pgh.pa.us 2415 [ # # # # ]:UBC 0 : if (P_RIGHTMOST(opaque) || ++tries > 4)
2416 : : break;
2417 : 0 : blkno = opaque->btpo_next;
7298 2418 : 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2916 kgrittn@postgresql.o 2419 : 0 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2420 : 0 : opaque = BTPageGetOpaque(page);
2421 : : }
2422 : :
2423 : : /* Return to the original page to see what's up */
7298 tgl@sss.pgh.pa.us 2424 : 0 : buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
2916 kgrittn@postgresql.o 2425 : 0 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2426 : 0 : opaque = BTPageGetOpaque(page);
7722 tgl@sss.pgh.pa.us 2427 [ # # ]: 0 : if (P_ISDELETED(opaque))
2428 : : {
2429 : : /*
2430 : : * It was deleted. Move right to first nondeleted page (there
2431 : : * must be one); that is the page that has acquired the deleted
2432 : : * one's keyspace, so stepping left from it will take us where we
2433 : : * want to be.
2434 : : */
2435 : : for (;;)
2436 : : {
2437 [ # # ]: 0 : if (P_RIGHTMOST(opaque))
5949 2438 [ # # ]: 0 : elog(ERROR, "fell off the end of index \"%s\"",
2439 : : RelationGetRelationName(rel));
7722 2440 : 0 : blkno = opaque->btpo_next;
7298 2441 : 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2916 kgrittn@postgresql.o 2442 : 0 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2443 : 0 : opaque = BTPageGetOpaque(page);
7722 tgl@sss.pgh.pa.us 2444 [ # # ]: 0 : if (!P_ISDELETED(opaque))
2445 : 0 : break;
2446 : : }
2447 : :
2448 : : /*
2449 : : * Now return to top of loop, resetting obknum to point to this
2450 : : * nondeleted page, and try again.
2451 : : */
2452 : : }
2453 : : else
2454 : : {
2455 : : /*
2456 : : * It wasn't deleted; the explanation had better be that the page
2457 : : * to the left got split or deleted. Without this check, we'd go
2458 : : * into an infinite loop if there's anything wrong.
2459 : : */
2460 [ # # ]: 0 : if (opaque->btpo_prev == lblkno)
5949 2461 [ # # ]: 0 : elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
2462 : : obknum, RelationGetRelationName(rel));
2463 : : /* Okay to try again with new lblkno value */
2464 : : }
2465 : : }
2466 : :
7722 tgl@sss.pgh.pa.us 2467 :CBC 16 : return InvalidBuffer;
2468 : : }
2469 : :
2470 : : /*
2471 : : * _bt_get_endpoint() -- Find the first or last page on a given tree level
2472 : : *
2473 : : * If the index is empty, we will return InvalidBuffer; any other failure
2474 : : * condition causes ereport(). We will not return a dead page.
2475 : : *
2476 : : * The returned buffer is pinned and read-locked.
2477 : : */
2478 : : Buffer
219 tmunro@postgresql.or 2479 :GNC 38763 : _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
2480 : : {
2481 : : Buffer buf;
2482 : : Page page;
2483 : : BTPageOpaque opaque;
2484 : : OffsetNumber offnum;
2485 : : BlockNumber blkno;
2486 : : IndexTuple itup;
2487 : :
2488 : : /*
2489 : : * If we are looking for a leaf page, okay to descend from fast root;
2490 : : * otherwise better descend from true root. (There is no point in being
2491 : : * smarter about intermediate levels.)
2492 : : */
7723 tgl@sss.pgh.pa.us 2493 [ + + ]:CBC 38763 : if (level == 0)
309 pg@bowt.ie 2494 : 38751 : buf = _bt_getroot(rel, NULL, BT_READ);
2495 : : else
2496 : 12 : buf = _bt_gettrueroot(rel);
2497 : :
7723 tgl@sss.pgh.pa.us 2498 [ + + ]: 38763 : if (!BufferIsValid(buf))
2499 : 3152 : return InvalidBuffer;
2500 : :
2916 kgrittn@postgresql.o 2501 : 35611 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2502 : 35611 : opaque = BTPageGetOpaque(page);
2503 : :
2504 : : for (;;)
2505 : : {
2506 : : /*
2507 : : * If we landed on a deleted page, step right to find a live page
2508 : : * (there must be one). Also, if we want the rightmost page, step
2509 : : * right if needed to get to it (this could happen if the page split
2510 : : * since we obtained a pointer to it).
2511 : : */
7722 tgl@sss.pgh.pa.us 2512 [ - + + + ]: 50490 : while (P_IGNORE(opaque) ||
7723 2513 [ - + ]: 33 : (rightmost && !P_RIGHTMOST(opaque)))
2514 : : {
7723 tgl@sss.pgh.pa.us 2515 :UBC 0 : blkno = opaque->btpo_next;
2516 [ # # ]: 0 : if (blkno == P_NONE)
5949 2517 [ # # ]: 0 : elog(ERROR, "fell off the end of index \"%s\"",
2518 : : RelationGetRelationName(rel));
7298 2519 : 0 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2916 kgrittn@postgresql.o 2520 : 0 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2521 : 0 : opaque = BTPageGetOpaque(page);
2522 : : }
2523 : :
2524 : : /* Done? */
1145 pg@bowt.ie 2525 [ + + ]:CBC 50490 : if (opaque->btpo_level == level)
7723 tgl@sss.pgh.pa.us 2526 : 35611 : break;
1145 pg@bowt.ie 2527 [ - + ]: 14879 : if (opaque->btpo_level < level)
1718 peter@eisentraut.org 2528 [ # # ]:UBC 0 : ereport(ERROR,
2529 : : (errcode(ERRCODE_INDEX_CORRUPTED),
2530 : : errmsg_internal("btree level %u not found in index \"%s\"",
2531 : : level, RelationGetRelationName(rel))));
2532 : :
2533 : : /* Descend to leftmost or rightmost child page */
7723 tgl@sss.pgh.pa.us 2534 [ + + ]:CBC 14879 : if (rightmost)
2535 : 3 : offnum = PageGetMaxOffsetNumber(page);
2536 : : else
2537 [ + - ]: 14876 : offnum = P_FIRSTDATAKEY(opaque);
2538 : :
6654 2539 : 14879 : itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
1581 pg@bowt.ie 2540 : 14879 : blkno = BTreeTupleGetDownLink(itup);
2541 : :
7298 tgl@sss.pgh.pa.us 2542 : 14879 : buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
2916 kgrittn@postgresql.o 2543 : 14879 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2544 : 14879 : opaque = BTPageGetOpaque(page);
2545 : : }
2546 : :
7723 tgl@sss.pgh.pa.us 2547 : 35611 : return buf;
2548 : : }
2549 : :
2550 : : /*
2551 : : * _bt_endpoint() -- Find the first or last page in the index, and scan
2552 : : * from there to the first key satisfying all the quals.
2553 : : *
2554 : : * This is used by _bt_first() to set up a scan when we've determined
2555 : : * that the scan must start at the beginning or end of the index (for
2556 : : * a forward or backward scan respectively). Exit conditions are the
2557 : : * same as for _bt_first().
2558 : : */
2559 : : static bool
10141 scrappy@hub.org 2560 : 38751 : _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
2561 : : {
6552 tgl@sss.pgh.pa.us 2562 : 38751 : Relation rel = scan->indexRelation;
2563 : 38751 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
2564 : : Buffer buf;
2565 : : Page page;
2566 : : BTPageOpaque opaque;
2567 : : OffsetNumber start;
2568 : : BTScanPosItem *currItem;
2569 : :
2570 : : /*
2571 : : * Scan down to the leftmost or rightmost leaf page. This is a simplified
2572 : : * version of _bt_search(). We don't maintain a stack since we know we
2573 : : * won't need it.
2574 : : */
219 tmunro@postgresql.or 2575 :GNC 38751 : buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
2576 : :
8424 bruce@momjian.us 2577 [ + + ]:CBC 38751 : if (!BufferIsValid(buf))
2578 : : {
2579 : : /*
2580 : : * Empty index. Lock the whole relation, as nothing finer to lock
2581 : : * exists.
2582 : : */
4687 heikki.linnakangas@i 2583 : 3152 : PredicateLockRelation(rel, scan->xs_snapshot);
3308 kgrittn@postgresql.o 2584 : 3152 : BTScanPosInvalidate(so->currPos);
8000 tgl@sss.pgh.pa.us 2585 : 3152 : return false;
2586 : : }
2587 : :
4687 heikki.linnakangas@i 2588 : 35599 : PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
2916 kgrittn@postgresql.o 2589 : 35599 : page = BufferGetPage(buf);
744 michael@paquier.xyz 2590 : 35599 : opaque = BTPageGetOpaque(page);
7723 tgl@sss.pgh.pa.us 2591 [ - + ]: 35599 : Assert(P_ISLEAF(opaque));
2592 : :
9716 bruce@momjian.us 2593 [ + + ]: 35599 : if (ScanDirectionIsForward(dir))
2594 : : {
2595 : : /* There could be dead pages to the left, so not this: */
2596 : : /* Assert(P_LEFTMOST(opaque)); */
2597 : :
8668 tgl@sss.pgh.pa.us 2598 [ + + ]: 35569 : start = P_FIRSTDATAKEY(opaque);
2599 : : }
9716 bruce@momjian.us 2600 [ + - ]: 30 : else if (ScanDirectionIsBackward(dir))
2601 : : {
8668 tgl@sss.pgh.pa.us 2602 [ - + ]: 30 : Assert(P_RIGHTMOST(opaque));
2603 : :
2604 : 30 : start = PageGetMaxOffsetNumber(page);
2605 : : }
2606 : : else
2607 : : {
7573 tgl@sss.pgh.pa.us 2608 [ # # ]:UBC 0 : elog(ERROR, "invalid scan direction: %d", (int) dir);
2609 : : start = 0; /* keep compiler quiet */
2610 : : }
2611 : :
2612 : : /* remember which buffer we have pinned */
6552 tgl@sss.pgh.pa.us 2613 :CBC 35599 : so->currPos.buf = buf;
2614 : :
2615 rhaas@postgresql.org 2615 : 35599 : _bt_initialize_more_data(so, dir);
2616 : :
2617 : : /*
2618 : : * Now load data from the first page of the scan.
2619 : : */
23 akorotkov@postgresql 2620 [ + + ]:GNC 35599 : if (!_bt_readpage(scan, dir, start, true))
2621 : : {
2622 : : /*
2623 : : * There's no actually-matching data on this page. Try to advance to
2624 : : * the next page. Return false if there's no matching data at all.
2625 : : */
1363 pg@bowt.ie 2626 :CBC 834 : _bt_unlockbuf(scan->indexRelation, so->currPos.buf);
6552 tgl@sss.pgh.pa.us 2627 [ + + ]: 834 : if (!_bt_steppage(scan, dir))
2628 : 792 : return false;
2629 : : }
2630 : : else
2631 : : {
2632 : : /* We have at least one item to return as scan's first item */
3308 kgrittn@postgresql.o 2633 : 34765 : _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2634 : : }
2635 : :
2636 : : /* OK, itemIndex says what to return */
4571 tgl@sss.pgh.pa.us 2637 : 34807 : currItem = &so->currPos.items[so->currPos.itemIndex];
1861 andres@anarazel.de 2638 : 34807 : scan->xs_heaptid = currItem->heapTid;
4571 tgl@sss.pgh.pa.us 2639 [ + + ]: 34807 : if (scan->xs_want_itup)
2640 : 32271 : scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
2641 : :
6552 2642 : 34807 : return true;
2643 : : }
2644 : :
2645 : : /*
2646 : : * _bt_initialize_more_data() -- initialize so->currPos's moreLeft, moreRight,
2647 : : * and scan direction
2648 : : */
2649 : : static inline void
2615 rhaas@postgresql.org 2650 : 6174381 : _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
2651 : : {
8 pg@bowt.ie 2652 :GNC 6174381 : so->currPos.dir = dir;
2653 [ + + ]: 6174381 : if (so->needPrimScan)
2654 : : {
2655 [ - + ]: 285 : Assert(so->numArrayKeys);
2656 : :
2657 : 285 : so->currPos.moreLeft = true;
2658 : 285 : so->currPos.moreRight = true;
2659 : 285 : so->needPrimScan = false;
2660 : : }
2661 [ + + ]: 6174096 : else if (ScanDirectionIsForward(dir))
2662 : : {
2615 rhaas@postgresql.org 2663 :CBC 6150154 : so->currPos.moreLeft = false;
2664 : 6150154 : so->currPos.moreRight = true;
2665 : : }
2666 : : else
2667 : : {
2668 : 23942 : so->currPos.moreLeft = true;
2669 : 23942 : so->currPos.moreRight = false;
2670 : : }
2671 : 6174381 : so->numKilled = 0; /* just paranoia */
2672 : 6174381 : so->markItemIndex = -1; /* ditto */
2673 : 6174381 : }
|