Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtree.c
4 : : * Implementation of Lehman and Yao's btree management algorithm for
5 : : * Postgres.
6 : : *
7 : : * NOTES
8 : : * This file contains only the public interface routines.
9 : : *
10 : : *
11 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
12 : : * Portions Copyright (c) 1994, Regents of the University of California
13 : : *
14 : : * IDENTIFICATION
15 : : * src/backend/access/nbtree/nbtree.c
16 : : *
17 : : *-------------------------------------------------------------------------
18 : : */
19 : : #include "postgres.h"
20 : :
21 : : #include "access/nbtree.h"
22 : : #include "access/relscan.h"
23 : : #include "access/xloginsert.h"
24 : : #include "commands/progress.h"
25 : : #include "commands/vacuum.h"
26 : : #include "miscadmin.h"
27 : : #include "nodes/execnodes.h"
28 : : #include "pgstat.h"
29 : : #include "storage/bulk_write.h"
30 : : #include "storage/condition_variable.h"
31 : : #include "storage/indexfsm.h"
32 : : #include "storage/ipc.h"
33 : : #include "storage/lmgr.h"
34 : : #include "storage/smgr.h"
35 : : #include "utils/fmgrprotos.h"
36 : : #include "utils/index_selfuncs.h"
37 : : #include "utils/memutils.h"
38 : :
39 : :
40 : : /*
41 : : * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
42 : : *
43 : : * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
44 : : * scan to advance it via another call to _bt_first.
45 : : *
46 : : * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
47 : : * a new page; others must wait.
48 : : *
49 : : * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
50 : : * to a new page; some process can start doing that.
51 : : *
52 : : * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
53 : : */
54 : : typedef enum
55 : : {
56 : : BTPARALLEL_NOT_INITIALIZED,
57 : : BTPARALLEL_NEED_PRIMSCAN,
58 : : BTPARALLEL_ADVANCING,
59 : : BTPARALLEL_IDLE,
60 : : BTPARALLEL_DONE,
61 : : } BTPS_State;
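
/*
 * Taken together, the usual lifecycle of one parallel btree scan (an
 * illustrative summary of the _bt_parallel_* functions below, not an
 * exhaustive list of transitions):
 *
 *   NOT_INITIALIZED -> ADVANCING      first successful _bt_parallel_seize()
 *   ADVANCING       -> IDLE           _bt_parallel_release() publishes the
 *                                     next page and wakes one waiter
 *   IDLE            -> ADVANCING      another backend seizes the scan
 *   IDLE            -> NEED_PRIMSCAN  _bt_parallel_primscan_schedule()
 *   NEED_PRIMSCAN   -> ADVANCING      seized (with first=true) by a backend
 *                                     able to start the next primitive scan
 *   any state       -> DONE           _bt_parallel_done(); no transitions out
 */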
62 : :
63 : : /*
64 : : * BTParallelScanDescData contains btree specific shared information required
65 : : * for parallel scan.
66 : : */
67 : : typedef struct BTParallelScanDescData
68 : : {
69 : : BlockNumber btps_scanPage; /* latest or next page to be scanned */
70 : : BTPS_State btps_pageStatus; /* indicates whether next page is
71 : : * available for scan. see above for
72 : : * possible states of parallel scan. */
73 : : slock_t btps_mutex; /* protects above variables, btps_arrElems */
74 : : ConditionVariable btps_cv; /* used to synchronize parallel scan */
75 : :
76 : : /*
77 : : * btps_arrElems is used when scans need to schedule another primitive
78 : : * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
79 : : */
80 : : int btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
81 : : } BTParallelScanDescData;
82 : :
83 : : typedef struct BTParallelScanDescData *BTParallelScanDesc;
84 : :
85 : :
86 : : static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
87 : : IndexBulkDeleteCallback callback, void *callback_state,
88 : : BTCycleId cycleid);
89 : : static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
90 : : static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
91 : : IndexTuple posting,
92 : : OffsetNumber updatedoffset,
93 : : int *nremaining);
94 : :
95 : :
96 : : /*
97 : : * Btree handler function: return IndexAmRoutine with access method parameters
98 : : * and callbacks.
99 : : */
100 : : Datum
3010 tgl@sss.pgh.pa.us 101 :CBC 1405783 : bthandler(PG_FUNCTION_ARGS)
102 : : {
103 : 1405783 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
104 : :
2908 teodor@sigaev.ru 105 : 1405783 : amroutine->amstrategies = BTMaxStrategyNumber;
106 : 1405783 : amroutine->amsupport = BTNProcs;
1476 akorotkov@postgresql 107 : 1405783 : amroutine->amoptsprocnum = BTOPTIONS_PROC;
3010 tgl@sss.pgh.pa.us 108 : 1405783 : amroutine->amcanorder = true;
109 : 1405783 : amroutine->amcanorderbyop = false;
110 : 1405783 : amroutine->amcanbackward = true;
111 : 1405783 : amroutine->amcanunique = true;
112 : 1405783 : amroutine->amcanmulticol = true;
113 : 1405783 : amroutine->amoptionalkey = true;
114 : 1405783 : amroutine->amsearcharray = true;
115 : 1405783 : amroutine->amsearchnulls = true;
116 : 1405783 : amroutine->amstorage = false;
117 : 1405783 : amroutine->amclusterable = true;
118 : 1405783 : amroutine->ampredlocks = true;
2615 rhaas@postgresql.org 119 : 1405783 : amroutine->amcanparallel = true;
128 tomas.vondra@postgre 120 :GNC 1405783 : amroutine->amcanbuildparallel = true;
2199 teodor@sigaev.ru 121 :CBC 1405783 : amroutine->amcaninclude = true;
1551 akapila@postgresql.o 122 : 1405783 : amroutine->amusemaintenanceworkmem = false;
391 tomas.vondra@postgre 123 : 1405783 : amroutine->amsummarizing = false;
1551 akapila@postgresql.o 124 : 1405783 : amroutine->amparallelvacuumoptions =
125 : : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
3010 tgl@sss.pgh.pa.us 126 : 1405783 : amroutine->amkeytype = InvalidOid;
127 : :
128 : 1405783 : amroutine->ambuild = btbuild;
129 : 1405783 : amroutine->ambuildempty = btbuildempty;
130 : 1405783 : amroutine->aminsert = btinsert;
141 tomas.vondra@postgre 131 :GNC 1405783 : amroutine->aminsertcleanup = NULL;
3010 tgl@sss.pgh.pa.us 132 :CBC 1405783 : amroutine->ambulkdelete = btbulkdelete;
133 : 1405783 : amroutine->amvacuumcleanup = btvacuumcleanup;
134 : 1405783 : amroutine->amcanreturn = btcanreturn;
135 : 1405783 : amroutine->amcostestimate = btcostestimate;
136 : 1405783 : amroutine->amoptions = btoptions;
2801 137 : 1405783 : amroutine->amproperty = btproperty;
1839 alvherre@alvh.no-ip. 138 : 1405783 : amroutine->ambuildphasename = btbuildphasename;
3010 tgl@sss.pgh.pa.us 139 : 1405783 : amroutine->amvalidate = btvalidate;
1352 140 : 1405783 : amroutine->amadjustmembers = btadjustmembers;
3010 141 : 1405783 : amroutine->ambeginscan = btbeginscan;
142 : 1405783 : amroutine->amrescan = btrescan;
143 : 1405783 : amroutine->amgettuple = btgettuple;
144 : 1405783 : amroutine->amgetbitmap = btgetbitmap;
145 : 1405783 : amroutine->amendscan = btendscan;
146 : 1405783 : amroutine->ammarkpos = btmarkpos;
147 : 1405783 : amroutine->amrestrpos = btrestrpos;
2615 rhaas@postgresql.org 148 : 1405783 : amroutine->amestimateparallelscan = btestimateparallelscan;
149 : 1405783 : amroutine->aminitparallelscan = btinitparallelscan;
150 : 1405783 : amroutine->amparallelrescan = btparallelrescan;
151 : :
3010 tgl@sss.pgh.pa.us 152 : 1405783 : PG_RETURN_POINTER(amroutine);
153 : : }
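
/*
 * For context, an illustrative sketch (not part of nbtree.c) of how core
 * code consumes a handler like bthandler(): the handler is called through
 * the function manager and must hand back a palloc'd IndexAmRoutine node,
 * roughly what amapi.c's GetIndexAmRoutine() does.  The function name below
 * is invented for the example.
 */
static IndexAmRoutine *
get_index_am_routine_sketch(Oid amhandler)
{
	Datum		datum = OidFunctionCall0(amhandler);
	IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);

	/* a handler that does not return a valid IndexAmRoutine node is broken */
	if (routine == NULL || !IsA(routine, IndexAmRoutine))
		elog(ERROR, "index access method handler %u did not return an IndexAmRoutine struct",
			 amhandler);

	return routine;
}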
154 : :
155 : : /*
156 : : * btbuildempty() -- build an empty btree index in the initialization fork
157 : : */
158 : : void
159 : 79 : btbuildempty(Relation index)
160 : : {
235 heikki.linnakangas@i 161 : 79 : bool allequalimage = _bt_allequalimage(index, false);
162 : : BulkWriteState *bulkstate;
163 : : BulkWriteBuffer metabuf;
164 : :
51 heikki.linnakangas@i 165 :GNC 79 : bulkstate = smgr_bulk_start_rel(index, INIT_FORKNUM);
166 : :
167 : : /* Construct metapage. */
168 : 79 : metabuf = smgr_bulk_get_buf(bulkstate);
169 : 79 : _bt_initmetapage((Page) metabuf, P_NONE, 0, allequalimage);
170 : 79 : smgr_bulk_write(bulkstate, BTREE_METAPAGE, metabuf, true);
171 : :
172 : 79 : smgr_bulk_finish(bulkstate);
4855 rhaas@postgresql.org 173 :CBC 79 : }
174 : :
175 : : /*
176 : : * btinsert() -- insert an index tuple into a btree.
177 : : *
178 : : * Descend the tree recursively, find the appropriate location for our
179 : : * new tuple, and put it there.
180 : : */
181 : : bool
3010 tgl@sss.pgh.pa.us 182 : 3358778 : btinsert(Relation rel, Datum *values, bool *isnull,
183 : : ItemPointer ht_ctid, Relation heapRel,
184 : : IndexUniqueCheck checkUnique,
185 : : bool indexUnchanged,
186 : : IndexInfo *indexInfo)
187 : : {
188 : : bool result;
189 : : IndexTuple itup;
190 : :
191 : : /* generate an index tuple */
6964 192 : 3358778 : itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
9716 bruce@momjian.us 193 : 3358778 : itup->t_tid = *ht_ctid;
194 : :
1187 pg@bowt.ie 195 : 3358778 : result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);
196 : :
9716 bruce@momjian.us 197 : 3358520 : pfree(itup);
198 : :
3010 tgl@sss.pgh.pa.us 199 : 3358520 : return result;
200 : : }
201 : :
202 : : /*
203 : : * btgettuple() -- Get the next tuple in the scan.
204 : : */
205 : : bool
206 : 14539978 : btgettuple(IndexScanDesc scan, ScanDirection dir)
207 : : {
7996 208 : 14539978 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
209 : : bool res;
210 : :
211 : : /* btree indexes are never lossy */
5845 212 : 14539978 : scan->xs_recheck = false;
213 : :
214 : : /* Each loop iteration performs another primitive index scan */
215 : : do
216 : : {
217 : : /*
218 : : * If we've already initialized this scan, we can just advance it in
219 : : * the appropriate direction. If we haven't done so yet, we call
220 : : * _bt_first() to get the first item in the scan.
221 : : */
4564 222 [ + + - + : 14540045 : if (!BTScanPosIsValid(so->currPos))
+ + ]
223 : 6414970 : res = _bt_first(scan, dir);
224 : : else
225 : : {
226 : : /*
227 : : * Check to see if we should kill the previously-fetched tuple.
228 : : */
229 [ + + ]: 8125075 : if (scan->kill_prior_tuple)
230 : : {
231 : : /*
232 : : * Yes, remember it for later. (We'll deal with all such
233 : : * tuples at once right before leaving the index page.) The
234 : : * test for numKilled overrun is not just paranoia: if the
235 : : * caller reverses direction in the indexscan then the same
236 : : * item might get entered multiple times. It's not worth
237 : : * trying to optimize that, so we don't detect it, but instead
238 : : * just forget any excess entries.
239 : : */
240 [ + + ]: 191611 : if (so->killedItems == NULL)
241 : 69602 : so->killedItems = (int *)
1509 pg@bowt.ie 242 : 69602 : palloc(MaxTIDsPerBTreePage * sizeof(int));
243 [ + - ]: 191611 : if (so->numKilled < MaxTIDsPerBTreePage)
4564 tgl@sss.pgh.pa.us 244 : 191611 : so->killedItems[so->numKilled++] = so->currPos.itemIndex;
245 : : }
246 : :
247 : : /*
248 : : * Now continue the scan.
249 : : */
250 : 8125075 : res = _bt_next(scan, dir);
251 : : }
252 : :
253 : : /* If we have a tuple, return it ... */
254 [ + + ]: 14540044 : if (res)
255 : 11627240 : break;
256 : : /* ... otherwise see if we need another primitive index scan */
8 pg@bowt.ie 257 [ + + + + ]:GNC 2912804 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
258 : :
3010 tgl@sss.pgh.pa.us 259 :CBC 14539977 : return res;
260 : : }
261 : :
262 : : /*
263 : : * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
264 : : */
265 : : int64
266 : 5702 : btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
267 : : {
6958 268 : 5702 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
5848 269 : 5702 : int64 ntids = 0;
270 : : ItemPointer heapTid;
271 : :
272 : : /* Each loop iteration performs another primitive index scan */
273 : : do
274 : : {
275 : : /* Fetch the first page & tuple */
4564 276 [ + + ]: 5920 : if (_bt_first(scan, ForwardScanDirection))
277 : : {
278 : : /* Save tuple ID, and continue scanning */
1861 andres@anarazel.de 279 : 4177 : heapTid = &scan->xs_heaptid;
4564 tgl@sss.pgh.pa.us 280 : 4177 : tbm_add_tuples(tbm, heapTid, 1, false);
281 : 4177 : ntids++;
282 : :
283 : : for (;;)
284 : : {
285 : : /*
286 : : * Advance to next tuple within page. This is the same as the
287 : : * easy case in _bt_next().
288 : : */
289 [ + + ]: 913013 : if (++so->currPos.itemIndex > so->currPos.lastItem)
290 : : {
291 : : /* let _bt_next do the heavy lifting */
292 [ + + ]: 6545 : if (!_bt_next(scan, ForwardScanDirection))
293 : 4177 : break;
294 : : }
295 : :
296 : : /* Save tuple ID, and continue scanning */
297 : 908836 : heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
298 : 908836 : tbm_add_tuples(tbm, heapTid, 1, false);
299 : 908836 : ntids++;
300 : : }
301 : : }
302 : : /* Now see if we need another primitive index scan */
8 pg@bowt.ie 303 [ + + + + ]:GNC 5920 : } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
304 : :
3010 tgl@sss.pgh.pa.us 305 :CBC 5702 : return ntids;
306 : : }
307 : :
308 : : /*
309 : : * btbeginscan() -- start a scan on a btree index
310 : : */
311 : : IndexScanDesc
312 : 6203297 : btbeginscan(Relation rel, int nkeys, int norderbys)
313 : : {
314 : : IndexScanDesc scan;
315 : : BTScanOpaque so;
316 : :
317 : : /* no order by operators allowed */
4882 318 [ - + ]: 6203297 : Assert(norderbys == 0);
319 : :
320 : : /* get the scan */
321 : 6203297 : scan = RelationGetIndexScan(rel, nkeys, norderbys);
322 : :
323 : : /* allocate private workspace */
324 : 6203297 : so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
3308 kgrittn@postgresql.o 325 : 6203297 : BTScanPosInvalidate(so->currPos);
326 : 6203297 : BTScanPosInvalidate(so->markPos);
4882 tgl@sss.pgh.pa.us 327 [ + + ]: 6203297 : if (scan->numberOfKeys > 0)
328 : 6197978 : so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
329 : : else
330 : 5319 : so->keyData = NULL;
331 : :
8 pg@bowt.ie 332 :GNC 6203297 : so->needPrimScan = false;
333 : 6203297 : so->scanBehind = false;
4564 tgl@sss.pgh.pa.us 334 :CBC 6203297 : so->arrayKeys = NULL;
8 pg@bowt.ie 335 :GNC 6203297 : so->orderProcs = NULL;
4564 tgl@sss.pgh.pa.us 336 :CBC 6203297 : so->arrayContext = NULL;
337 : :
4882 338 : 6203297 : so->killedItems = NULL; /* until needed */
339 : 6203297 : so->numKilled = 0;
340 : :
341 : : /*
342 : : * We don't know yet whether the scan will be index-only, so we do not
343 : : * allocate the tuple workspace arrays until btrescan. However, we set up
344 : : * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
345 : : */
4571 346 : 6203297 : so->currTuples = so->markTuples = NULL;
347 : :
4564 348 : 6203297 : scan->xs_itupdesc = RelationGetDescr(rel);
349 : :
4882 350 : 6203297 : scan->opaque = so;
351 : :
3010 352 : 6203297 : return scan;
353 : : }
354 : :
355 : : /*
356 : : * btrescan() -- rescan an index relation
357 : : */
358 : : void
359 : 6421077 : btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
360 : : ScanKey orderbys, int norderbys)
361 : : {
4882 362 : 6421077 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
363 : :
364 : : /* we aren't holding any read locks, but gotta drop the pins */
6552 365 [ + + - + : 6421077 : if (BTScanPosIsValid(so->currPos))
+ + ]
366 : : {
367 : : /* Before leaving current page, deal with any killed items */
368 [ + + ]: 29045 : if (so->numKilled > 0)
3308 kgrittn@postgresql.o 369 : 253 : _bt_killitems(scan);
370 [ - + - - : 29045 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
371 : 29045 : BTScanPosInvalidate(so->currPos);
372 : : }
373 : :
6443 tgl@sss.pgh.pa.us 374 : 6421077 : so->markItemIndex = -1;
8 pg@bowt.ie 375 :GNC 6421077 : so->needPrimScan = false;
376 : 6421077 : so->scanBehind = false;
3308 kgrittn@postgresql.o 377 [ + - - + :CBC 6421077 : BTScanPosUnpinIfPinned(so->markPos);
- + ]
378 : 6421077 : BTScanPosInvalidate(so->markPos);
379 : :
380 : : /*
381 : : * Allocate tuple workspace arrays, if needed for an index-only scan and
382 : : * not already done in a previous rescan call. To save on palloc
383 : : * overhead, both workspaces are allocated as one palloc block; only this
384 : : * function and btendscan know that.
385 : : *
386 : : * NOTE: this data structure also makes it safe to return data from a
387 : : * "name" column, even though btree name_ops uses an underlying storage
388 : : * datatype of cstring. The risk there is that "name" is supposed to be
389 : : * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
390 : : * However, since we only return data out of tuples sitting in the
391 : : * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
392 : : * data out of the markTuples array --- running off the end of memory for
393 : : * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats
394 : : * adding special-case treatment for name_ops elsewhere.
395 : : */
4571 tgl@sss.pgh.pa.us 396 [ + + + + ]: 6421077 : if (scan->xs_want_itup && so->currTuples == NULL)
397 : : {
398 : 58923 : so->currTuples = (char *) palloc(BLCKSZ * 2);
399 : 58923 : so->markTuples = so->currTuples + BLCKSZ;
400 : : }
401 : :
402 : : /*
403 : : * Reset the scan keys
404 : : */
7693 405 [ + + + + ]: 6421077 : if (scankey && scan->numberOfKeys > 0)
9716 bruce@momjian.us 406 : 6415713 : memmove(scan->keyData,
407 : : scankey,
408 : 6415713 : scan->numberOfKeys * sizeof(ScanKeyData));
7459 tgl@sss.pgh.pa.us 409 : 6421077 : so->numberOfKeys = 0; /* until _bt_preprocess_keys sets it */
8 pg@bowt.ie 410 :GNC 6421077 : so->numArrayKeys = 0; /* ditto */
10141 scrappy@hub.org 411 :CBC 6421077 : }
412 : :
413 : : /*
414 : : * btendscan() -- close down a scan
415 : : */
416 : : void
3010 tgl@sss.pgh.pa.us 417 : 6202609 : btendscan(IndexScanDesc scan)
418 : : {
6552 419 : 6202609 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
420 : :
421 : : /* we aren't holding any read locks, but gotta drop the pins */
422 [ + + - + : 6202609 : if (BTScanPosIsValid(so->currPos))
+ + ]
423 : : {
424 : : /* Before leaving current page, deal with any killed items */
425 [ + + ]: 3472642 : if (so->numKilled > 0)
3308 kgrittn@postgresql.o 426 : 34479 : _bt_killitems(scan);
427 [ - + - - : 3472642 : BTScanPosUnpinIfPinned(so->currPos);
+ + ]
428 : : }
429 : :
6443 tgl@sss.pgh.pa.us 430 : 6202609 : so->markItemIndex = -1;
3308 kgrittn@postgresql.o 431 [ + + - + : 6202609 : BTScanPosUnpinIfPinned(so->markPos);
+ + ]
432 : :
433 : : /* No need to invalidate positions, the RAM is about to be freed. */
434 : :
435 : : /* Release storage */
7403 neilc@samurai.com 436 [ + + ]: 6202609 : if (so->keyData != NULL)
9716 bruce@momjian.us 437 : 6197311 : pfree(so->keyData);
438 : : /* so->arrayKeys and so->orderProcs are in arrayContext */
4564 tgl@sss.pgh.pa.us 439 [ + + ]: 6202609 : if (so->arrayContext != NULL)
440 : 419 : MemoryContextDelete(so->arrayContext);
441 [ + + ]: 6202609 : if (so->killedItems != NULL)
442 : 69580 : pfree(so->killedItems);
4571 443 [ + + ]: 6202609 : if (so->currTuples != NULL)
444 : 58895 : pfree(so->currTuples);
445 : : /* so->markTuples should not be pfree'd, see btrescan */
9716 bruce@momjian.us 446 : 6202609 : pfree(so);
10141 scrappy@hub.org 447 : 6202609 : }
448 : :
449 : : /*
450 : : * btmarkpos() -- save current scan position
451 : : */
452 : : void
3010 tgl@sss.pgh.pa.us 453 : 65039 : btmarkpos(IndexScanDesc scan)
454 : : {
6552 455 : 65039 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
456 : :
457 : : /* There may be an old mark with a pin (but no lock). */
3308 kgrittn@postgresql.o 458 [ + + - + : 65039 : BTScanPosUnpinIfPinned(so->markPos);
+ + ]
459 : :
460 : : /*
461 : : * Just record the current itemIndex. If we later step to next page
462 : : * before releasing the marked position, _bt_steppage makes a full copy of
463 : : * the currPos struct in markPos. If (as often happens) the mark is moved
464 : : * before we leave the page, we don't have to do that work.
465 : : */
6552 tgl@sss.pgh.pa.us 466 [ - + - - : 65039 : if (BTScanPosIsValid(so->currPos))
+ - ]
6443 467 : 65039 : so->markItemIndex = so->currPos.itemIndex;
468 : : else
469 : : {
3308 kgrittn@postgresql.o 470 :UBC 0 : BTScanPosInvalidate(so->markPos);
6443 tgl@sss.pgh.pa.us 471 : 0 : so->markItemIndex = -1;
472 : : }
10141 scrappy@hub.org 473 :CBC 65039 : }
474 : :
475 : : /*
476 : : * btrestrpos() -- restore scan to last saved position
477 : : */
478 : : void
3010 tgl@sss.pgh.pa.us 479 : 27015 : btrestrpos(IndexScanDesc scan)
480 : : {
6552 481 : 27015 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
482 : :
6443 483 [ + + ]: 27015 : if (so->markItemIndex >= 0)
484 : : {
485 : : /*
486 : : * The scan has never moved to a new page since the last mark. Just
487 : : * restore the itemIndex.
488 : : *
489 : : * NB: In this case we can't count on anything in so->markPos to be
490 : : * accurate.
491 : : */
492 : 26961 : so->currPos.itemIndex = so->markItemIndex;
493 : : }
494 : : else
495 : : {
496 : : /*
497 : : * The scan moved to a new page after last mark or restore, and we are
498 : : * now restoring to the marked page. We aren't holding any read
499 : : * locks, but if we're still holding the pin for the current position,
500 : : * we must drop it.
501 : : */
502 [ - + - - : 54 : if (BTScanPosIsValid(so->currPos))
+ - ]
503 : : {
504 : : /* Before leaving current page, deal with any killed items */
3308 kgrittn@postgresql.o 505 [ - + ]: 54 : if (so->numKilled > 0)
3308 kgrittn@postgresql.o 506 :UBC 0 : _bt_killitems(scan);
3308 kgrittn@postgresql.o 507 [ - + - - :CBC 54 : BTScanPosUnpinIfPinned(so->currPos);
- + ]
508 : : }
509 : :
6443 tgl@sss.pgh.pa.us 510 [ - + - - : 54 : if (BTScanPosIsValid(so->markPos))
+ - ]
511 : : {
512 : : /* bump pin on mark buffer for assignment to current buffer */
3308 kgrittn@postgresql.o 513 [ - + - - : 54 : if (BTScanPosIsPinned(so->markPos))
- + ]
3308 kgrittn@postgresql.o 514 :UBC 0 : IncrBufferRefCount(so->markPos.buf);
6443 tgl@sss.pgh.pa.us 515 :CBC 54 : memcpy(&so->currPos, &so->markPos,
516 : : offsetof(BTScanPosData, items[1]) +
517 : 54 : so->markPos.lastItem * sizeof(BTScanPosItem));
4571 518 [ - + ]: 54 : if (so->currTuples)
4571 tgl@sss.pgh.pa.us 519 :UBC 0 : memcpy(so->currTuples, so->markTuples,
520 : 0 : so->markPos.nextTupleOffset);
521 : : /* Reset the scan's array keys (see _bt_steppage for why) */
8 pg@bowt.ie 522 [ - + ]:GNC 54 : if (so->numArrayKeys)
523 : : {
8 pg@bowt.ie 524 :UNC 0 : _bt_start_array_keys(scan, so->currPos.dir);
525 : 0 : so->needPrimScan = false;
526 : : }
527 : : }
528 : : else
3308 kgrittn@postgresql.o 529 :UBC 0 : BTScanPosInvalidate(so->currPos);
530 : : }
10141 scrappy@hub.org 531 :CBC 27015 : }
532 : :
533 : : /*
534 : : * btestimateparallelscan -- estimate storage for BTParallelScanDescData
535 : : */
536 : : Size
8 pg@bowt.ie 537 :GNC 29 : btestimateparallelscan(int nkeys, int norderbys)
538 : : {
539 : : /* Pessimistically assume all input scankeys will be output with arrays */
540 : 29 : return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
541 : : }
542 : :
543 : : /*
544 : : * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
545 : : */
546 : : void
2615 rhaas@postgresql.org 547 :CBC 29 : btinitparallelscan(void *target)
548 : : {
549 : 29 : BTParallelScanDesc bt_target = (BTParallelScanDesc) target;
550 : :
551 : 29 : SpinLockInit(&bt_target->btps_mutex);
552 : 29 : bt_target->btps_scanPage = InvalidBlockNumber;
553 : 29 : bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
554 : 29 : ConditionVariableInit(&bt_target->btps_cv);
555 : 29 : }
556 : :
557 : : /*
558 : : * btparallelrescan() -- reset parallel scan
559 : : */
560 : : void
561 : 12 : btparallelrescan(IndexScanDesc scan)
562 : : {
563 : : BTParallelScanDesc btscan;
564 : 12 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
565 : :
566 [ - + ]: 12 : Assert(parallel_scan);
567 : :
568 : 12 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
569 : : parallel_scan->ps_offset);
570 : :
571 : : /*
572 : : * In theory, we don't need to acquire the spinlock here, because there
573 : : * shouldn't be any other workers running at this point, but we do so for
574 : : * consistency.
575 : : */
576 [ - + ]: 12 : SpinLockAcquire(&btscan->btps_mutex);
577 : 12 : btscan->btps_scanPage = InvalidBlockNumber;
578 : 12 : btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
579 : 12 : SpinLockRelease(&btscan->btps_mutex);
580 : 12 : }
581 : :
582 : : /*
583 : : * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
584 : : * page. Other scans must wait until we call _bt_parallel_release()
585 : : * or _bt_parallel_done().
586 : : *
587 : : * The return value is true if we successfully seized the scan and false
588 : : * if we did not. The latter case occurs if no pages remain.
589 : : *
590 : : * If the return value is true, *pageno returns the next or current page
591 : : * of the scan (depending on the scan direction). An invalid block number
592 : : * means the scan hasn't yet started, or that caller needs to start the next
593 : : * primitive index scan (if it's the latter case we'll set so.needPrimScan).
594 : : * The first time a participating process reaches the last page, it will return
595 : : * true and set *pageno to P_NONE; after that, further attempts to seize the
596 : : * scan will return false.
597 : : *
598 : : * Callers should ignore the value of pageno if the return value is false.
599 : : *
600 : : * Callers that are in a position to start a new primitive index scan must
601 : : * pass first=true (all other callers pass first=false). We just return false
602 : : * for first=false callers that require another primitive index scan.
603 : : */
604 : : bool
8 pg@bowt.ie 605 :GNC 882 : _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
606 : : {
2615 rhaas@postgresql.org 607 :CBC 882 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
608 : 882 : bool exit_loop = false;
609 : 882 : bool status = true;
610 : 882 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
611 : : BTParallelScanDesc btscan;
612 : :
613 : 882 : *pageno = P_NONE;
614 : :
8 pg@bowt.ie 615 [ + + ]:GNC 882 : if (first)
616 : : {
617 : : /*
618 : : * Initialize array related state when called from _bt_first, assuming
619 : : * that this will either be the first primitive index scan for the
620 : : * scan, or a previous explicitly scheduled primitive scan.
621 : : *
622 : : * Note: so->needPrimScan is only set when a scheduled primitive index
623 : : * scan is set to be performed in caller's worker process. It should
624 : : * not be set here by us for the first primitive scan, nor should we
625 : : * ever set it for a parallel scan that has no array keys.
626 : : */
627 : 217 : so->needPrimScan = false;
628 : 217 : so->scanBehind = false;
629 : : }
630 : : else
631 : : {
632 : : /*
633 : : * Don't attempt to seize the scan when backend requires another
634 : : * primitive index scan unless we're in a position to start it now
635 : : */
636 [ + + ]: 665 : if (so->needPrimScan)
637 : 18 : return false;
638 : : }
639 : :
2615 rhaas@postgresql.org 640 :CBC 864 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
641 : : parallel_scan->ps_offset);
642 : :
643 : : while (1)
644 : : {
645 [ + + ]: 962 : SpinLockAcquire(&btscan->btps_mutex);
646 : :
8 pg@bowt.ie 647 [ + + ]:GNC 962 : if (btscan->btps_pageStatus == BTPARALLEL_DONE)
648 : : {
649 : : /* We're done with this parallel index scan */
2615 rhaas@postgresql.org 650 :GBC 158 : status = false;
651 : : }
8 pg@bowt.ie 652 [ + + ]:GNC 804 : else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
653 : : {
654 [ - + ]: 18 : Assert(so->numArrayKeys);
655 : :
656 : : /*
657 : : * If we can start another primitive scan right away, do so.
658 : : * Otherwise just wait.
659 : : */
660 [ + - ]: 18 : if (first)
661 : : {
662 : 18 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
663 [ + + ]: 36 : for (int i = 0; i < so->numArrayKeys; i++)
664 : : {
665 : 18 : BTArrayKeyInfo *array = &so->arrayKeys[i];
666 : 18 : ScanKey skey = &so->keyData[array->scan_key];
667 : :
668 : 18 : array->cur_elem = btscan->btps_arrElems[i];
669 : 18 : skey->sk_argument = array->elem_values[array->cur_elem];
670 : : }
671 : 18 : so->needPrimScan = true;
672 : 18 : so->scanBehind = false;
673 : 18 : *pageno = InvalidBlockNumber;
674 : 18 : exit_loop = true;
675 : : }
676 : : }
677 [ + + ]: 786 : else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
678 : : {
679 : : /*
680 : : * We have successfully seized control of the scan for the purpose
681 : : * of advancing it to a new page!
682 : : */
2615 rhaas@postgresql.org 683 :CBC 688 : btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
684 : 688 : *pageno = btscan->btps_scanPage;
685 : 688 : exit_loop = true;
686 : : }
687 : 962 : SpinLockRelease(&btscan->btps_mutex);
688 [ + + + + ]: 962 : if (exit_loop || !status)
689 : : break;
690 : 98 : ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
691 : : }
692 : 864 : ConditionVariableCancelSleep();
693 : :
694 : 864 : return status;
695 : : }
696 : :
697 : : /*
698 : : * _bt_parallel_release() -- Complete the process of advancing the scan to a
699 : : * new page. We now have the new value btps_scanPage; some other backend
700 : : * can now begin advancing the scan.
701 : : *
702 : : * Callers whose scan uses array keys must save their scan_page argument so
703 : : * that it can be passed to _bt_parallel_primscan_schedule, should caller
704 : : * determine that another primitive index scan is required. If that happens,
705 : : * scan_page won't be scanned by any backend (unless the next primitive index
706 : : * scan lands on scan_page).
707 : : */
708 : : void
709 : 665 : _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
710 : : {
711 : 665 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
712 : : BTParallelScanDesc btscan;
713 : :
714 : 665 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
715 : : parallel_scan->ps_offset);
716 : :
717 [ - + ]: 665 : SpinLockAcquire(&btscan->btps_mutex);
718 : 665 : btscan->btps_scanPage = scan_page;
719 : 665 : btscan->btps_pageStatus = BTPARALLEL_IDLE;
720 : 665 : SpinLockRelease(&btscan->btps_mutex);
721 : 665 : ConditionVariableSignal(&btscan->btps_cv);
722 : 665 : }
723 : :
724 : : /*
725 : : * _bt_parallel_done() -- Mark the parallel scan as complete.
726 : : *
727 : : * When there are no pages left to scan, this function should be called to
728 : : * notify other workers. Otherwise, they might wait forever for the scan to
729 : : * advance to the next page.
730 : : */
731 : : void
732 : 2919355 : _bt_parallel_done(IndexScanDesc scan)
733 : : {
734 : 2919355 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
735 : : BTParallelScanDesc btscan;
736 : 2919355 : bool status_changed = false;
737 : :
738 : : /* Do nothing, for non-parallel scans */
739 [ + + ]: 2919355 : if (parallel_scan == NULL)
740 : 2919299 : return;
741 : :
742 : 56 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
743 : : parallel_scan->ps_offset);
744 : :
745 : : /*
746 : : * Mark the parallel scan as done, unless some other process did so
747 : : * already
748 : : */
749 [ + + ]: 56 : SpinLockAcquire(&btscan->btps_mutex);
8 pg@bowt.ie 750 [ + + ]:GNC 56 : if (btscan->btps_pageStatus != BTPARALLEL_DONE)
751 : : {
2615 rhaas@postgresql.org 752 :CBC 41 : btscan->btps_pageStatus = BTPARALLEL_DONE;
753 : 41 : status_changed = true;
754 : : }
755 : 56 : SpinLockRelease(&btscan->btps_mutex);
756 : :
757 : : /* wake up all the workers associated with this parallel scan */
758 [ + + ]: 56 : if (status_changed)
759 : 41 : ConditionVariableBroadcast(&btscan->btps_cv);
760 : : }
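
/*
 * Tying the above together, an illustrative sketch (not part of nbtree.c) of
 * how a participating backend advances an already-started parallel scan:
 * seize the shared state, read the page it names, then either publish the
 * page to visit next or mark the whole scan done.  The helper
 * read_page_and_get_next() is hypothetical; the real logic lives in
 * _bt_first()/_bt_next() and their subroutines.  (The very first seize, made
 * from _bt_first() with first=true, instead sees InvalidBlockNumber and
 * positions the scan by descending the tree.)
 */
static BlockNumber read_page_and_get_next(IndexScanDesc scan,
										  BlockNumber blkno); /* hypothetical */

static void
parallel_scan_step_sketch(IndexScanDesc scan)
{
	BlockNumber blkno;
	BlockNumber next;

	/* returns false once the shared scan has reached BTPARALLEL_DONE */
	if (!_bt_parallel_seize(scan, &blkno, false))
		return;

	if (blkno == P_NONE)
	{
		/* nothing left to read; wake every backend waiting on btps_cv */
		_bt_parallel_done(scan);
		return;
	}

	/* read page blkno and work out which page should be visited after it */
	next = read_page_and_get_next(scan, blkno);

	/* hand the scan back so some other backend can advance it to "next" */
	_bt_parallel_release(scan, next);
}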
761 : :
762 : : /*
763 : : * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
764 : : *
765 : : * Caller passes the block number most recently passed to _bt_parallel_release
766 : : * by its backend. Caller successfully schedules the next primitive index scan
767 : : * if the shared parallel state hasn't been seized since caller's backend last
768 : : * advanced the scan.
769 : : */
770 : : void
8 pg@bowt.ie 771 :GNC 18 : _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
772 : : {
2615 rhaas@postgresql.org 773 :GBC 18 : BTScanOpaque so = (BTScanOpaque) scan->opaque;
774 : 18 : ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
775 : : BTParallelScanDesc btscan;
776 : :
8 pg@bowt.ie 777 [ - + ]:GNC 18 : Assert(so->numArrayKeys);
778 : :
2615 rhaas@postgresql.org 779 :GBC 18 : btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
780 : : parallel_scan->ps_offset);
781 : :
782 [ - + ]: 18 : SpinLockAcquire(&btscan->btps_mutex);
8 pg@bowt.ie 783 [ + - ]:GNC 18 : if (btscan->btps_scanPage == prev_scan_page &&
784 [ + - ]: 18 : btscan->btps_pageStatus == BTPARALLEL_IDLE)
785 : : {
2615 rhaas@postgresql.org 786 :GBC 18 : btscan->btps_scanPage = InvalidBlockNumber;
8 pg@bowt.ie 787 :GNC 18 : btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
788 : :
789 : : /* Serialize scan's current array keys */
790 [ + + ]: 36 : for (int i = 0; i < so->numArrayKeys; i++)
791 : : {
792 : 18 : BTArrayKeyInfo *array = &so->arrayKeys[i];
793 : :
794 : 18 : btscan->btps_arrElems[i] = array->cur_elem;
795 : : }
796 : : }
2615 rhaas@postgresql.org 797 :GBC 18 : SpinLockRelease(&btscan->btps_mutex);
798 : 18 : }
799 : :
800 : : /*
801 : : * Bulk deletion of all index entries pointing to a set of heap tuples.
802 : : * The set of target tuples is specified via a callback routine that tells
803 : : * whether any given heap tuple (identified by ItemPointer) is being deleted.
804 : : *
805 : : * Result: a palloc'd struct containing statistical info for VACUUM displays.
806 : : */
807 : : IndexBulkDeleteResult *
3010 tgl@sss.pgh.pa.us 808 :CBC 1256 : btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
809 : : IndexBulkDeleteCallback callback, void *callback_state)
810 : : {
6557 811 : 1256 : Relation rel = info->index;
812 : : BTCycleId cycleid;
813 : :
814 : : /* allocate stats if first time through, else re-use existing struct */
6551 815 [ + - ]: 1256 : if (stats == NULL)
816 : 1256 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
817 : :
818 : : /* Establish the vacuum cycle ID to use for this scan */
819 : : /* The ENSURE stuff ensures we clean up shared memory on failure */
5842 820 [ + - ]: 1256 : PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
821 : : {
6551 822 : 1256 : cycleid = _bt_start_vacuum(rel);
823 : :
1444 pg@bowt.ie 824 : 1256 : btvacuumscan(info, stats, callback, callback_state, cycleid);
825 : : }
5842 tgl@sss.pgh.pa.us 826 [ - + ]: 1256 : PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
827 : 1256 : _bt_end_vacuum(rel);
828 : :
3010 829 : 1256 : return stats;
830 : : }
831 : :
832 : : /*
833 : : * Post-VACUUM cleanup.
834 : : *
835 : : * Result: a palloc'd struct containing statistical info for VACUUM displays.
836 : : */
837 : : IndexBulkDeleteResult *
838 : 101084 : btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
839 : : {
840 : : BlockNumber num_delpages;
841 : :
842 : : /* No-op in ANALYZE ONLY mode */
5500 843 [ + + ]: 101084 : if (info->analyze_only)
3010 844 : 7344 : return stats;
845 : :
846 : : /*
847 : : * If btbulkdelete was called, we need not do anything (we just maintain
848 : : * the information used within _bt_vacuum_needs_cleanup() by calling
849 : : * _bt_set_cleanup_info() below).
850 : : *
851 : : * If btbulkdelete was _not_ called, then we have a choice to make: we
852 : : * must decide whether or not a btvacuumscan() call is needed now (i.e.
853 : : * whether the ongoing VACUUM operation can entirely avoid a physical scan
854 : : * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
855 : : * now.
856 : : */
6557 857 [ + + ]: 93740 : if (stats == NULL)
858 : : {
859 : : /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
309 pg@bowt.ie 860 [ + + ]: 92774 : if (!_bt_vacuum_needs_cleanup(info->index))
2202 teodor@sigaev.ru 861 : 92768 : return NULL;
862 : :
863 : : /*
864 : : * Since we aren't going to actually delete any leaf items, there's no
865 : : * need to go through all the vacuum-cycle-ID pushups here.
866 : : *
867 : : * Posting list tuples are a source of inaccuracy for cleanup-only
868 : : * scans. btvacuumscan() will assume that the number of index tuples
869 : : * from each page can be used as num_index_tuples, even though
870 : : * num_index_tuples is supposed to represent the number of TIDs in the
871 : : * index. This naive approach can underestimate the number of tuples
872 : : * in the index significantly.
873 : : *
874 : : * We handle the problem by making num_index_tuples an estimate in
875 : : * cleanup-only case.
876 : : */
6557 tgl@sss.pgh.pa.us 877 : 6 : stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
1444 pg@bowt.ie 878 : 6 : btvacuumscan(info, stats, NULL, NULL, 0);
1131 879 : 6 : stats->estimated_count = true;
880 : : }
881 : :
882 : : /*
883 : : * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
884 : : *
885 : : * num_delpages is the number of deleted pages now in the index that were
886 : : * not safe to place in the FSM to be recycled just yet. num_delpages is
887 : : * greater than 0 only when _bt_pagedel() actually deleted pages during
888 : : * our call to btvacuumscan(). Even then, _bt_pendingfsm_finalize() must
889 : : * have failed to place any newly deleted pages in the FSM just moments
890 : : * ago. (Actually, there are edge cases where recycling of the current
891 : : * VACUUM's newly deleted pages does not even become safe by the time the
892 : : * next VACUUM comes around. See nbtree/README.)
893 : : */
1145 894 [ - + ]: 972 : Assert(stats->pages_deleted >= stats->pages_free);
895 : 972 : num_delpages = stats->pages_deleted - stats->pages_free;
309 896 : 972 : _bt_set_cleanup_info(info->index, num_delpages);
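
	/*
	 * For example: a VACUUM that ends with pages_deleted = 10 and
	 * pages_free = 7 stores num_delpages = 3 in the metapage, the three
	 * deleted pages whose recycling must be left to a later VACUUM
	 * (illustrative numbers only).
	 */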
897 : :
898 : : /*
899 : : * It's quite possible for us to be fooled by concurrent page splits into
900 : : * double-counting some index tuples, so disbelieve any total that exceeds
901 : : * the underlying heap's count ... if we know that accurately. Otherwise
902 : : * this might just make matters worse.
903 : : */
5179 tgl@sss.pgh.pa.us 904 [ + + ]: 972 : if (!info->estimated_count)
905 : : {
6551 906 [ + + ]: 946 : if (stats->num_index_tuples > info->num_heap_tuples)
907 : 19 : stats->num_index_tuples = info->num_heap_tuples;
908 : : }
909 : :
3010 910 : 972 : return stats;
911 : : }
912 : :
913 : : /*
914 : : * btvacuumscan --- scan the index for VACUUMing purposes
915 : : *
916 : : * This combines the functions of looking for leaf tuples that are deletable
917 : : * according to the vacuum callback, looking for empty pages that can be
918 : : * deleted, and looking for old deleted pages that can be recycled. Both
919 : : * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
920 : : * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
921 : : *
922 : : * The caller is responsible for initially allocating/zeroing a stats struct
923 : : * and for obtaining a vacuum cycle ID if necessary.
924 : : */
925 : : static void
6551 926 : 1262 : btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
927 : : IndexBulkDeleteCallback callback, void *callback_state,
928 : : BTCycleId cycleid)
929 : : {
930 : 1262 : Relation rel = info->index;
931 : : BTVacState vstate;
932 : : BlockNumber num_pages;
933 : : BlockNumber scanblkno;
934 : : bool needLock;
935 : :
936 : : /*
937 : : * Reset fields that track information about the entire index now. This
938 : : * avoids double-counting in the case where a single VACUUM command
939 : : * requires multiple scans of the index.
940 : : *
941 : : * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
942 : : * since they track information about the VACUUM command, and so must last
943 : : * across each call to btvacuumscan().
944 : : *
945 : : * (Note that pages_free is treated as state about the whole index, not
946 : : * the current VACUUM. This is appropriate because RecordFreeIndexPage()
947 : : * calls are idempotent, and get repeated for the same deleted pages in
948 : : * some scenarios. The point for us is to track the number of recyclable
949 : : * pages in the index at the end of the VACUUM command.)
950 : : */
1145 pg@bowt.ie 951 : 1262 : stats->num_pages = 0;
6551 tgl@sss.pgh.pa.us 952 : 1262 : stats->num_index_tuples = 0;
953 : 1262 : stats->pages_deleted = 0;
1145 pg@bowt.ie 954 : 1262 : stats->pages_free = 0;
955 : :
956 : : /* Set up info to pass down to btvacuumpage */
6551 tgl@sss.pgh.pa.us 957 : 1262 : vstate.info = info;
958 : 1262 : vstate.stats = stats;
959 : 1262 : vstate.callback = callback;
960 : 1262 : vstate.callback_state = callback_state;
961 : 1262 : vstate.cycleid = cycleid;
962 : :
963 : : /* Create a temporary memory context to run _bt_pagedel in */
964 : 1262 : vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
965 : : "_bt_pagedel",
966 : : ALLOCSET_DEFAULT_SIZES);
967 : :
968 : : /* Initialize vstate fields used by _bt_pendingfsm_finalize */
1120 pg@bowt.ie 969 : 1262 : vstate.bufsize = 0;
970 : 1262 : vstate.maxbufsize = 0;
971 : 1262 : vstate.pendingpages = NULL;
972 : 1262 : vstate.npendingpages = 0;
973 : : /* Consider applying _bt_pendingfsm_finalize optimization */
974 : 1262 : _bt_pendingfsm_init(rel, &vstate, (callback == NULL));
975 : :
976 : : /*
977 : : * The outer loop iterates over all index pages except the metapage, in
978 : : * physical order (we hope the kernel will cooperate in providing
979 : : * read-ahead for speed). It is critical that we visit all leaf pages,
980 : : * including ones added after we start the scan, else we might fail to
981 : : * delete some deletable tuples. Hence, we must repeatedly check the
982 : : * relation length. We must acquire the relation-extension lock while
983 : : * doing so to avoid a race condition: if someone else is extending the
984 : : * relation, there is a window where bufmgr/smgr have created a new
985 : : * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
986 : : * we manage to scan such a page here, we'll improperly assume it can be
987 : : * recycled. Taking the lock synchronizes things enough to prevent a
988 : : * problem: either num_pages won't include the new page, or _bt_getbuf
989 : : * already has write lock on the buffer and it will be fully initialized
990 : : * before we can examine it. Also, we need not worry if a page is added
991 : : * immediately after we look; the page splitting code already has
992 : : * write-lock on the left page before it adds a right page, so we must
993 : : * already have processed any tuples due to be moved into such a page.
994 : : *
995 : : * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
996 : : * think the use of the extension lock is still required.
997 : : *
998 : : * We can skip locking for new or temp relations, however, since no one
999 : : * else could be accessing them.
1000 : : */
6551 tgl@sss.pgh.pa.us 1001 [ + + + - ]: 1262 : needLock = !RELATION_IS_LOCAL(rel);
1002 : :
1443 pg@bowt.ie 1003 : 1262 : scanblkno = BTREE_METAPAGE + 1;
1004 : : for (;;)
1005 : : {
1006 : : /* Get the current relation length */
6551 tgl@sss.pgh.pa.us 1007 [ + + ]: 2401 : if (needLock)
1008 : 2399 : LockRelationForExtension(rel, ExclusiveLock);
1009 : 2401 : num_pages = RelationGetNumberOfBlocks(rel);
1010 [ + + ]: 2401 : if (needLock)
1011 : 2399 : UnlockRelationForExtension(rel, ExclusiveLock);
1012 : :
1839 alvherre@alvh.no-ip. 1013 [ + + ]: 2401 : if (info->report_progress)
1014 : 456 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1015 : : num_pages);
1016 : :
1017 : : /* Quit if we've scanned the whole relation */
1443 pg@bowt.ie 1018 [ + + ]: 2401 : if (scanblkno >= num_pages)
6551 tgl@sss.pgh.pa.us 1019 : 1262 : break;
1020 : : /* Iterate over pages, then loop back to recheck length */
1443 pg@bowt.ie 1021 [ + + ]: 11830 : for (; scanblkno < num_pages; scanblkno++)
1022 : : {
1023 : 10691 : btvacuumpage(&vstate, scanblkno);
1839 alvherre@alvh.no-ip. 1024 [ + + ]: 10691 : if (info->report_progress)
1025 : 211 : pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1026 : : scanblkno);
1027 : : }
1028 : : }
1029 : :
1030 : : /* Set statistics num_pages field to final size of index */
1145 pg@bowt.ie 1031 : 1262 : stats->num_pages = num_pages;
1032 : :
6551 tgl@sss.pgh.pa.us 1033 : 1262 : MemoryContextDelete(vstate.pagedelcontext);
1034 : :
1035 : : /*
1036 : : * If there were any calls to _bt_pagedel() during scan of the index then
1037 : : * see if any of the resulting pages can be placed in the FSM now. When
1038 : : * it's not safe we'll have to leave it up to a future VACUUM operation.
1039 : : *
1040 : : * Finally, if we placed any pages in the FSM (either just now or during
1041 : : * the scan), forcibly update the upper-level FSM pages to ensure that
1042 : : * searchers can find them.
1043 : : */
1120 pg@bowt.ie 1044 : 1262 : _bt_pendingfsm_finalize(rel, &vstate);
1145 1045 [ + + ]: 1262 : if (stats->pages_free > 0)
2207 tgl@sss.pgh.pa.us 1046 : 22 : IndexFreeSpaceMapVacuum(rel);
6551 1047 : 1262 : }
1048 : :
1049 : : /*
1050 : : * btvacuumpage --- VACUUM one page
1051 : : *
1052 : : * This processes a single page for btvacuumscan(). In some cases we must
1053 : : * backtrack to re-examine and VACUUM pages that were the scanblkno during
1054 : : * a previous call here. This is how we handle page splits (that happened
1055 : : * after our cycleid was acquired) whose right half page happened to reuse
1056 : : * a block that we might have processed at some point before it was
1057 : : * recycled (i.e. before the page split).
1058 : : */
1059 : : static void
1443 pg@bowt.ie 1060 : 10691 : btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
1061 : : {
6551 tgl@sss.pgh.pa.us 1062 : 10691 : IndexVacuumInfo *info = vstate->info;
1063 : 10691 : IndexBulkDeleteResult *stats = vstate->stats;
1064 : 10691 : IndexBulkDeleteCallback callback = vstate->callback;
1065 : 10691 : void *callback_state = vstate->callback_state;
1066 : 10691 : Relation rel = info->index;
377 pg@bowt.ie 1067 : 10691 : Relation heaprel = info->heaprel;
1068 : : bool attempt_pagedel;
1069 : : BlockNumber blkno,
1070 : : backtrack_to;
1071 : : Buffer buf;
1072 : : Page page;
1073 : : BTPageOpaque opaque;
1074 : :
1443 1075 : 10691 : blkno = scanblkno;
1076 : :
1077 : 10691 : backtrack:
1078 : :
1079 : 10691 : attempt_pagedel = false;
1080 : 10691 : backtrack_to = P_NONE;
1081 : :
1082 : : /* call vacuum_delay_point while not holding any buffer lock */
6551 tgl@sss.pgh.pa.us 1083 : 10691 : vacuum_delay_point();
1084 : :
1085 : : /*
1086 : : * We can't use _bt_getbuf() here because it always applies
1087 : : * _bt_checkpage(), which will barf on an all-zero page. We want to
1088 : : * recycle all-zero pages, not fail. Also, we want to use a nondefault
1089 : : * buffer access strategy.
1090 : : */
5644 heikki.linnakangas@i 1091 : 10691 : buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
1092 : : info->strategy);
1363 pg@bowt.ie 1093 : 10691 : _bt_lockbuf(rel, buf, BT_READ);
2916 kgrittn@postgresql.o 1094 : 10691 : page = BufferGetPage(buf);
1443 pg@bowt.ie 1095 : 10691 : opaque = NULL;
6551 tgl@sss.pgh.pa.us 1096 [ + - ]: 10691 : if (!PageIsNew(page))
1097 : : {
1098 : 10691 : _bt_checkpage(rel, buf);
744 michael@paquier.xyz 1099 : 10691 : opaque = BTPageGetOpaque(page);
1100 : : }
1101 : :
1443 pg@bowt.ie 1102 [ - + ]: 10691 : Assert(blkno <= scanblkno);
1103 [ - + ]: 10691 : if (blkno != scanblkno)
1104 : : {
1105 : : /*
1106 : : * We're backtracking.
1107 : : *
1108 : : * We followed a right link to a sibling leaf page (a page that
1109 : : * happens to be from a block located before scanblkno). The only
1110 : : * case we want to do anything with is a live leaf page having the
1111 : : * current vacuum cycle ID.
1112 : : *
1113 : : * The page had better be in a state that's consistent with what we
1114 : : * expect. Check for conditions that imply corruption in passing. It
1115 : : * can't be half-dead because only an interrupted VACUUM process can
1116 : : * leave pages in that state, so we'd definitely have dealt with it
1117 : : * back when the page was the scanblkno page (half-dead pages are
1118 : : * always marked fully deleted by _bt_pagedel(), barring corruption).
1119 : : */
1443 pg@bowt.ie 1120 [ # # # # :UBC 0 : if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
# # ]
1121 : : {
1122 : 0 : Assert(false);
1123 : : ereport(LOG,
1124 : : (errcode(ERRCODE_INDEX_CORRUPTED),
1125 : : errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
1126 : : blkno, scanblkno, RelationGetRelationName(rel))));
1127 : : _bt_relbuf(rel, buf);
1128 : : return;
1129 : : }
1130 : :
1131 : : /*
1132 : : * We may have already processed the page in an earlier call, when the
1133 : : * page was scanblkno. This happens when the leaf page split occurred
1134 : : * after the scan began, but before the right sibling page became the
1135 : : * scanblkno.
1136 : : *
1137 : : * Page may also have been deleted by current btvacuumpage() call,
1138 : : * since _bt_pagedel() sometimes deletes the right sibling page of
1139 : : * scanblkno in passing (it does so after we decided where to
1140 : : * backtrack to). We don't need to process this page as a deleted
1141 : : * page a second time now (in fact, it would be wrong to count it as a
1142 : : * deleted page in the bulk delete statistics a second time).
1143 : : */
1144 [ # # # # ]: 0 : if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
1145 : : {
1146 : : /* Done with current scanblkno (and all lower split pages) */
6551 tgl@sss.pgh.pa.us 1147 : 0 : _bt_relbuf(rel, buf);
1148 : 0 : return;
1149 : : }
1150 : : }
1151 : :
377 pg@bowt.ie 1152 [ + - + + ]:CBC 10691 : if (!opaque || BTPageIsRecyclable(page, heaprel))
1153 : : {
1154 : : /* Okay to recycle this page (which could be leaf or internal) */
5675 heikki.linnakangas@i 1155 : 146 : RecordFreeIndexPage(rel, blkno);
6551 tgl@sss.pgh.pa.us 1156 : 146 : stats->pages_deleted++;
1145 pg@bowt.ie 1157 : 146 : stats->pages_free++;
1158 : : }
6551 tgl@sss.pgh.pa.us 1159 [ + + ]: 10545 : else if (P_ISDELETED(opaque))
1160 : : {
1161 : : /*
1162 : : * Already deleted page (which could be leaf or internal). Can't
1163 : : * recycle yet.
1164 : : */
1165 : 97 : stats->pages_deleted++;
1166 : : }
6374 1167 [ - + ]: 10448 : else if (P_ISHALFDEAD(opaque))
1168 : : {
1169 : : /* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
1144 pg@bowt.ie 1170 :UBC 0 : attempt_pagedel = true;
1171 : :
1172 : : /*
1173 : : * _bt_pagedel() will increment both pages_newly_deleted and
1174 : : * pages_deleted stats in all cases (barring corruption)
1175 : : */
1176 : : }
6551 tgl@sss.pgh.pa.us 1177 [ + + ]:CBC 10448 : else if (P_ISLEAF(opaque))
1178 : : {
1179 : : OffsetNumber deletable[MaxIndexTuplesPerPage];
1180 : : int ndeletable;
1181 : : BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1182 : : int nupdatable;
1183 : : OffsetNumber offnum,
1184 : : minoff,
1185 : : maxoff;
1186 : : int nhtidsdead,
1187 : : nhtidslive;
1188 : :
1189 : : /*
1190 : : * Trade in the initial read lock for a full cleanup lock on this
1191 : : * page. We must get such a lock on every leaf page over the course
1192 : : * of the vacuum scan, whether or not it actually contains any
1193 : : * deletable tuples --- see nbtree/README.
1194 : : */
1363 pg@bowt.ie 1195 : 9835 : _bt_upgradelockbufcleanup(rel, buf);
1196 : :
1197 : : /*
1198 : : * Check whether we need to backtrack to earlier pages. What we are
1199 : : * concerned about is a page split that happened since we started the
1200 : : * vacuum scan. If the split moved tuples on the right half of the
1201 : : * split (i.e. the tuples that sort high) to a block that we already
1202 : : * passed over, then we might have missed the tuples. We need to
1203 : : * backtrack now. (Must do this before possibly clearing btpo_cycleid
1204 : : * or deleting scanblkno page below!)
1205 : : */
6551 tgl@sss.pgh.pa.us 1206 [ + + ]: 9835 : if (vstate->cycleid != 0 &&
1207 [ + + ]: 9769 : opaque->btpo_cycleid == vstate->cycleid &&
1208 [ + - ]: 2 : !(opaque->btpo_flags & BTP_SPLIT_END) &&
1209 [ + + ]: 2 : !P_RIGHTMOST(opaque) &&
1443 pg@bowt.ie 1210 [ - + ]: 1 : opaque->btpo_next < scanblkno)
1443 pg@bowt.ie 1211 :UBC 0 : backtrack_to = opaque->btpo_next;
1212 : :
6551 tgl@sss.pgh.pa.us 1213 :CBC 9835 : ndeletable = 0;
1509 pg@bowt.ie 1214 : 9835 : nupdatable = 0;
6551 tgl@sss.pgh.pa.us 1215 [ + + ]: 9835 : minoff = P_FIRSTDATAKEY(opaque);
1216 : 9835 : maxoff = PageGetMaxOffsetNumber(page);
1509 pg@bowt.ie 1217 : 9835 : nhtidsdead = 0;
1218 : 9835 : nhtidslive = 0;
6551 tgl@sss.pgh.pa.us 1219 [ + + ]: 9835 : if (callback)
1220 : : {
1221 : : /* btbulkdelete callback tells us what to delete (or update) */
1222 : 9769 : for (offnum = minoff;
1223 [ + + ]: 1876353 : offnum <= maxoff;
1224 : 1866584 : offnum = OffsetNumberNext(offnum))
1225 : : {
1226 : : IndexTuple itup;
1227 : :
1228 : 1866584 : itup = (IndexTuple) PageGetItem(page,
1229 : : PageGetItemId(page, offnum));
1230 : :
1509 pg@bowt.ie 1231 [ - + ]: 1866584 : Assert(!BTreeTupleIsPivot(itup));
1232 [ + + ]: 1866584 : if (!BTreeTupleIsPosting(itup))
1233 : : {
1234 : : /* Regular tuple, standard table TID representation */
1235 [ + + ]: 1801560 : if (callback(&itup->t_tid, callback_state))
1236 : : {
1237 : 829905 : deletable[ndeletable++] = offnum;
1238 : 829905 : nhtidsdead++;
1239 : : }
1240 : : else
1241 : 971655 : nhtidslive++;
1242 : : }
1243 : : else
1244 : : {
1245 : : BTVacuumPosting vacposting;
1246 : : int nremaining;
1247 : :
1248 : : /* Posting list tuple */
1249 : 65024 : vacposting = btreevacuumposting(vstate, itup, offnum,
1250 : : &nremaining);
1251 [ + + ]: 65024 : if (vacposting == NULL)
1252 : : {
1253 : : /*
1254 : : * All table TIDs from the posting tuple remain, so no
1255 : : * delete or update required
1256 : : */
1257 [ - + ]: 22126 : Assert(nremaining == BTreeTupleGetNPosting(itup));
1258 : : }
1259 [ + + ]: 42898 : else if (nremaining > 0)
1260 : : {
1261 : :
1262 : : /*
1263 : : * Store metadata about posting list tuple in
1264 : : * updatable array for entire page. Existing tuple
1265 : : * will be updated during the later call to
1266 : : * _bt_delitems_vacuum().
1267 : : */
1268 [ - + ]: 18041 : Assert(nremaining < BTreeTupleGetNPosting(itup));
1269 : 18041 : updatable[nupdatable++] = vacposting;
1270 : 18041 : nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
1271 : : }
1272 : : else
1273 : : {
1274 : : /*
1275 : : * All table TIDs from the posting list must be
1276 : : * deleted. We'll delete the index tuple completely
1277 : : * (no update required).
1278 : : */
1279 [ - + ]: 24857 : Assert(nremaining == 0);
1280 : 24857 : deletable[ndeletable++] = offnum;
1281 : 24857 : nhtidsdead += BTreeTupleGetNPosting(itup);
1282 : 24857 : pfree(vacposting);
1283 : : }
1284 : :
1285 : 65024 : nhtidslive += nremaining;
1286 : : }
1287 : : }
1288 : : }
1289 : :
1290 : : /*
1291 : : * Apply any needed deletes or updates. We issue just one
1292 : : * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
1293 : : */
1294 [ + + + + ]: 9835 : if (ndeletable > 0 || nupdatable > 0)
1295 : : {
1443 1296 [ - + ]: 6767 : Assert(nhtidsdead >= ndeletable + nupdatable);
1509 1297 : 6767 : _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
1298 : : nupdatable);
1299 : :
1300 : 6767 : stats->tuples_removed += nhtidsdead;
1301 : : /* must recompute maxoff */
6551 tgl@sss.pgh.pa.us 1302 : 6767 : maxoff = PageGetMaxOffsetNumber(page);
1303 : :
1304 : : /* can't leak memory here */
1509 pg@bowt.ie 1305 [ + + ]: 24808 : for (int i = 0; i < nupdatable; i++)
1306 : 18041 : pfree(updatable[i]);
1307 : : }
1308 : : else
1309 : : {
1310 : : /*
1311 : : * If the leaf page has been split during this vacuum cycle, it
1312 : : * seems worth expending a write to clear btpo_cycleid even if we
1313 : : * don't have any deletions to do. (If we do, _bt_delitems_vacuum
1314 : : * takes care of this.) This ensures we won't process the page
1315 : : * again.
1316 : : *
1317 : : * We treat this like a hint-bit update because there's no need to
1318 : : * WAL-log it.
1319 : : */
1320 [ - + ]: 3068 : Assert(nhtidsdead == 0);
6551 tgl@sss.pgh.pa.us 1321 [ + + ]: 3068 : if (vstate->cycleid != 0 &&
1322 [ + + ]: 3002 : opaque->btpo_cycleid == vstate->cycleid)
1323 : : {
1324 : 1 : opaque->btpo_cycleid = 0;
3954 jdavis@postgresql.or 1325 : 1 : MarkBufferDirtyHint(buf, true);
1326 : : }
1327 : : }
1328 : :
1329 : : /*
1330 : : * If the leaf page is now empty, try to delete it; else count the
1331 : : * live tuples (live table TIDs in posting lists are counted as
1332 : : * separate live tuples). We don't delete when backtracking, though,
1333 : : * since that would require teaching _bt_pagedel() about backtracking
1334 : : * (doesn't seem worth adding more complexity to deal with that).
1335 : : *
1336 : : * We don't count the number of live TIDs during cleanup-only calls to
1337 : : * btvacuumscan (i.e. when callback is not set). We count the number
1338 : : * of index tuples directly instead. This avoids the expense of
1339 : : * directly examining all of the tuples on each page. VACUUM will
1340 : : * treat num_index_tuples as an estimate in cleanup-only case, so it
1341 : : * doesn't matter that this underestimates num_index_tuples
1342 : : * significantly in some cases.
1343 : : */
6551 tgl@sss.pgh.pa.us 1344 [ + + ]: 9835 : if (minoff > maxoff)
1443 pg@bowt.ie 1345 : 2991 : attempt_pagedel = (blkno == scanblkno);
1257 1346 [ + + ]: 6844 : else if (callback)
1509 1347 : 6782 : stats->num_index_tuples += nhtidslive;
1348 : : else
1257 1349 : 62 : stats->num_index_tuples += maxoff - minoff + 1;
1350 : :
1443 1351 [ + + - + ]: 9835 : Assert(!attempt_pagedel || nhtidslive == 0);
1352 : : }
1353 : :
1354 [ + + ]: 10691 : if (attempt_pagedel)
1355 : : {
1356 : : MemoryContext oldcontext;
1357 : :
1358 : : /* Run pagedel in a temp context to avoid memory leakage */
6551 tgl@sss.pgh.pa.us 1359 : 2991 : MemoryContextReset(vstate->pagedelcontext);
1360 : 2991 : oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
1361 : :
1362 : : /*
1363 : : * _bt_pagedel maintains the bulk delete stats on our behalf;
1364 : : * pages_newly_deleted and pages_deleted are likely to be incremented
1365 : : * during call
1366 : : */
1443 pg@bowt.ie 1367 [ - + ]: 2991 : Assert(blkno == scanblkno);
1144 1368 : 2991 : _bt_pagedel(rel, buf, vstate);
1369 : :
6551 tgl@sss.pgh.pa.us 1370 : 2991 : MemoryContextSwitchTo(oldcontext);
1371 : : /* pagedel released buffer, so we shouldn't */
1372 : : }
1373 : : else
1374 : 7700 : _bt_relbuf(rel, buf);
1375 : :
1443 pg@bowt.ie 1376 [ - + ]: 10691 : if (backtrack_to != P_NONE)
1377 : : {
1443 pg@bowt.ie 1378 :UBC 0 : blkno = backtrack_to;
1379 : 0 : goto backtrack;
1380 : : }
1381 : : }
1382 : :
1383 : : /*
1384 : : * btreevacuumposting --- determine TIDs still needed in posting list
1385 : : *
1386 : : * Returns metadata describing how to build replacement tuple without the TIDs
1387 : : * that VACUUM needs to delete. Returned value is NULL in the common case
1388 : : * where no changes are needed to caller's posting list tuple (we avoid
1389 : : * allocating memory here as an optimization).
1390 : : *
1391 : : * The number of TIDs that should remain in the posting list tuple is set for
1392 : : * caller in *nremaining.
1393 : : */
1394 : : static BTVacuumPosting
1509 pg@bowt.ie 1395 :CBC 65024 : btreevacuumposting(BTVacState *vstate, IndexTuple posting,
1396 : : OffsetNumber updatedoffset, int *nremaining)
1397 : : {
1398 : 65024 : int live = 0;
1399 : 65024 : int nitem = BTreeTupleGetNPosting(posting);
1400 : 65024 : ItemPointer items = BTreeTupleGetPosting(posting);
1401 : 65024 : BTVacuumPosting vacposting = NULL;
1402 : :
1403 [ + + ]: 367917 : for (int i = 0; i < nitem; i++)
1404 : : {
1405 [ + + ]: 302893 : if (!vstate->callback(items + i, vstate->callback_state))
1406 : : {
1407 : : /* Live table TID */
1408 : 148860 : live++;
1409 : : }
1410 [ + + ]: 154033 : else if (vacposting == NULL)
1411 : : {
1412 : : /*
1413 : : * First dead table TID encountered.
1414 : : *
1415 : : * It's now clear that we need to delete one or more dead table
1416 : : * TIDs, so start maintaining metadata describing how to update
1417 : : * existing posting list tuple.
1418 : : */
1419 : 42898 : vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1420 : : nitem * sizeof(uint16));
1421 : :
1422 : 42898 : vacposting->itup = posting;
1423 : 42898 : vacposting->updatedoffset = updatedoffset;
1424 : 42898 : vacposting->ndeletedtids = 0;
1425 : 42898 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1426 : : }
1427 : : else
1428 : : {
1429 : : /* Second or subsequent dead table TID */
1430 : 111135 : vacposting->deletetids[vacposting->ndeletedtids++] = i;
1431 : : }
1432 : : }
1433 : :
1434 : 65024 : *nremaining = live;
1435 : 65024 : return vacposting;
1436 : : }
1437 : :
1438 : : /*
1439 : : * btcanreturn() -- Check whether btree indexes support index-only scans.
1440 : : *
1441 : : * btrees always do, so this is trivial.
1442 : : */
1443 : : bool
3010 tgl@sss.pgh.pa.us 1444 : 453677 : btcanreturn(Relation index, int attno)
1445 : : {
1446 : 453677 : return true;
1447 : : }