/*-------------------------------------------------------------------------
 *
 * nbtree.c
 *	  Implementation of Lehman and Yao's btree management algorithm for
 *	  Postgres.
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtree.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/nbtxlog.h"
#include "access/relscan.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "storage/condition_variable.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/memutils.h"


/*
 * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
 *
 * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
 * a new page; others must wait.
 *
 * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan
 * to a new page; some process can start doing that.
 *
 * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
 * We reach this state once for every distinct combination of array keys.
 */
typedef enum
{
	BTPARALLEL_NOT_INITIALIZED,
	BTPARALLEL_ADVANCING,
	BTPARALLEL_IDLE,
	BTPARALLEL_DONE
} BTPS_State;
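
/*
 * Summary of the state transitions implemented by the functions further
 * down in this file: _bt_parallel_seize() moves the scan from
 * BTPARALLEL_NOT_INITIALIZED or BTPARALLEL_IDLE to BTPARALLEL_ADVANCING
 * (other backends sleep on btps_cv while it is ADVANCING);
 * _bt_parallel_release() moves it back to BTPARALLEL_IDLE once the next
 * page is known; _bt_parallel_done() moves it to BTPARALLEL_DONE; and
 * _bt_parallel_advance_array_keys() resets it to BTPARALLEL_NOT_INITIALIZED
 * for the next set of array keys.
 */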

/*
 * BTParallelScanDescData contains btree specific shared information required
 * for parallel scan.
 */
typedef struct BTParallelScanDescData
{
	BlockNumber btps_scanPage;	/* latest or next page to be scanned */
	BTPS_State	btps_pageStatus;	/* indicates whether next page is
									 * available for scan. see above for
									 * possible states of parallel scan. */
	int			btps_arrayKeyCount; /* count indicating number of array scan
									 * keys processed by parallel scan */
	slock_t		btps_mutex;		/* protects above variables */
	ConditionVariable btps_cv;	/* used to synchronize parallel scan */
} BTParallelScanDescData;

typedef struct BTParallelScanDescData *BTParallelScanDesc;
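
/*
 * This struct lives in the parallel scan's shared memory area: its size is
 * reported by btestimateparallelscan(), it is initialized in place by
 * btinitparallelscan(), and later callers locate it again with
 * OffsetToPointer(parallel_scan, parallel_scan->ps_offset).
 */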

static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
						 IndexBulkDeleteCallback callback, void *callback_state,
						 BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
										  IndexTuple posting,
										  OffsetNumber updatedoffset,
										  int *nremaining);


/*
 * Btree handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
bthandler(PG_FUNCTION_ARGS)
{
	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

	amroutine->amstrategies = BTMaxStrategyNumber;
	amroutine->amsupport = BTNProcs;
	amroutine->amoptsprocnum = BTOPTIONS_PROC;
	amroutine->amcanorder = true;
	amroutine->amcanorderbyop = false;
	amroutine->amcanbackward = true;
	amroutine->amcanunique = true;
	amroutine->amcanmulticol = true;
	amroutine->amoptionalkey = true;
	amroutine->amsearcharray = true;
	amroutine->amsearchnulls = true;
	amroutine->amstorage = false;
	amroutine->amclusterable = true;
	amroutine->ampredlocks = true;
	amroutine->amcanparallel = true;
	amroutine->amcaninclude = true;
	amroutine->amusemaintenanceworkmem = false;
	amroutine->amsummarizing = false;
	amroutine->amparallelvacuumoptions =
		VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
	amroutine->amkeytype = InvalidOid;

	amroutine->ambuild = btbuild;
	amroutine->ambuildempty = btbuildempty;
	amroutine->aminsert = btinsert;
	amroutine->ambulkdelete = btbulkdelete;
	amroutine->amvacuumcleanup = btvacuumcleanup;
	amroutine->amcanreturn = btcanreturn;
	amroutine->amcostestimate = btcostestimate;
	amroutine->amoptions = btoptions;
	amroutine->amproperty = btproperty;
	amroutine->ambuildphasename = btbuildphasename;
	amroutine->amvalidate = btvalidate;
	amroutine->amadjustmembers = btadjustmembers;
	amroutine->ambeginscan = btbeginscan;
	amroutine->amrescan = btrescan;
	amroutine->amgettuple = btgettuple;
	amroutine->amgetbitmap = btgetbitmap;
	amroutine->amendscan = btendscan;
	amroutine->ammarkpos = btmarkpos;
	amroutine->amrestrpos = btrestrpos;
	amroutine->amestimateparallelscan = btestimateparallelscan;
	amroutine->aminitparallelscan = btinitparallelscan;
	amroutine->amparallelrescan = btparallelrescan;

	PG_RETURN_POINTER(amroutine);
}

/*
 * btbuildempty() -- build an empty btree index in the initialization fork
 */
void
btbuildempty(Relation index)
{
	Page		metapage;

	/* Construct metapage. */
	metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
	_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));

	/*
	 * Write the page and log it.  It might seem that an immediate sync would
	 * be sufficient to guarantee that the file exists on disk, but recovery
	 * itself might remove it while replaying, for example, an
	 * XLOG_DBASE_CREATE* or XLOG_TBLSPC_CREATE record.  Therefore, we need
	 * this even when wal_level=minimal.
	 */
	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
	smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BTREE_METAPAGE,
			  metapage, true);
	log_newpage(&RelationGetSmgr(index)->smgr_rlocator.locator, INIT_FORKNUM,
				BTREE_METAPAGE, metapage, true);

	/*
	 * An immediate sync is required even if we xlog'd the page, because the
	 * write did not go through shared_buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM);
}
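
/*
 * Note: ambuildempty is only used for unlogged indexes.  The metapage
 * written to INIT_FORKNUM above is what crash recovery copies over the main
 * fork when it resets unlogged relations, leaving behind a valid, empty
 * btree.
 */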

/*
 *	btinsert() -- insert an index tuple into a btree.
 *
 *		Descend the tree recursively, find the appropriate location for our
 *		new tuple, and put it there.
 */
bool
btinsert(Relation rel, Datum *values, bool *isnull,
		 ItemPointer ht_ctid, Relation heapRel,
		 IndexUniqueCheck checkUnique,
		 bool indexUnchanged,
		 IndexInfo *indexInfo)
{
	bool		result;
	IndexTuple	itup;

	/* generate an index tuple */
	itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
	itup->t_tid = *ht_ctid;

	result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

	pfree(itup);

	return result;
}

/*
 *	btgettuple() -- Get the next tuple in the scan.
 */
bool
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* btree indexes are never lossy */
	scan->xs_recheck = false;

	/*
	 * If we have any array keys, initialize them during first call for a
	 * scan.  We can't do this in btrescan because we don't know the scan
	 * direction at that time.
	 */
	if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return false;

		_bt_start_array_keys(scan, dir);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/*
		 * If we've already initialized this scan, we can just advance it in
		 * the appropriate direction.  If we haven't done so yet, we call
		 * _bt_first() to get the first item in the scan.
		 */
		if (!BTScanPosIsValid(so->currPos))
			res = _bt_first(scan, dir);
		else
		{
			/*
			 * Check to see if we should kill the previously-fetched tuple.
			 */
			if (scan->kill_prior_tuple)
			{
				/*
				 * Yes, remember it for later. (We'll deal with all such
				 * tuples at once right before leaving the index page.)  The
				 * test for numKilled overrun is not just paranoia: if the
				 * caller reverses direction in the indexscan then the same
				 * item might get entered multiple times. It's not worth
				 * trying to optimize that, so we don't detect it, but instead
				 * just forget any excess entries.
				 */
				if (so->killedItems == NULL)
					so->killedItems = (int *)
						palloc(MaxTIDsPerBTreePage * sizeof(int));
				if (so->numKilled < MaxTIDsPerBTreePage)
					so->killedItems[so->numKilled++] = so->currPos.itemIndex;
			}

			/*
			 * Now continue the scan.
			 */
			res = _bt_next(scan, dir);
		}

		/* If we have a tuple, return it ... */
		if (res)
			break;
		/* ... otherwise see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

	return res;
}

/*
 * btgetbitmap() -- gets all matching tuples, and adds them to a bitmap
 */
int64
btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	int64		ntids = 0;
	ItemPointer heapTid;

	/*
	 * If we have any array keys, initialize them.
	 */
	if (so->numArrayKeys)
	{
		/* punt if we have any unsatisfiable array keys */
		if (so->numArrayKeys < 0)
			return ntids;

		_bt_start_array_keys(scan, ForwardScanDirection);
	}

	/* This loop handles advancing to the next array elements, if any */
	do
	{
		/* Fetch the first page & tuple */
		if (_bt_first(scan, ForwardScanDirection))
		{
			/* Save tuple ID, and continue scanning */
			heapTid = &scan->xs_heaptid;
			tbm_add_tuples(tbm, heapTid, 1, false);
			ntids++;

			for (;;)
			{
				/*
				 * Advance to next tuple within page.  This is the same as the
				 * easy case in _bt_next().
				 */
				if (++so->currPos.itemIndex > so->currPos.lastItem)
				{
					/* let _bt_next do the heavy lifting */
					if (!_bt_next(scan, ForwardScanDirection))
						break;
				}

				/* Save tuple ID, and continue scanning */
				heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
				tbm_add_tuples(tbm, heapTid, 1, false);
				ntids++;
			}
		}
		/* Now see if we have more array keys to deal with */
	} while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

	return ntids;
}

/*
 *	btbeginscan() -- start a scan on a btree index
 */
IndexScanDesc
btbeginscan(Relation rel, int nkeys, int norderbys)
{
	IndexScanDesc scan;
	BTScanOpaque so;

	/* no order by operators allowed */
	Assert(norderbys == 0);

	/* get the scan */
	scan = RelationGetIndexScan(rel, nkeys, norderbys);

	/* allocate private workspace */
	so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
	BTScanPosInvalidate(so->currPos);
	BTScanPosInvalidate(so->markPos);
	if (scan->numberOfKeys > 0)
		so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
	else
		so->keyData = NULL;

	so->arrayKeyData = NULL;	/* assume no array keys for now */
	so->numArrayKeys = 0;
	so->arrayKeys = NULL;
	so->arrayContext = NULL;

	so->killedItems = NULL;		/* until needed */
	so->numKilled = 0;

	/*
	 * We don't know yet whether the scan will be index-only, so we do not
	 * allocate the tuple workspace arrays until btrescan.  However, we set up
	 * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
	 */
	so->currTuples = so->markTuples = NULL;

	scan->xs_itupdesc = RelationGetDescr(rel);

	scan->opaque = so;

	return scan;
}
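
/*
 * The index AM API guarantees that amrescan (btrescan) is called before the
 * first amgettuple/amgetbitmap call on a scan, which is why the
 * currTuples/markTuples workspace can safely be left unallocated above.
 */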

/*
 *	btrescan() -- rescan an index relation
 */
void
btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
		 ScanKey orderbys, int norderbys)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
		BTScanPosInvalidate(so->currPos);
	}

	so->markItemIndex = -1;
	so->arrayKeyCount = 0;
	BTScanPosUnpinIfPinned(so->markPos);
	BTScanPosInvalidate(so->markPos);

	/*
	 * Allocate tuple workspace arrays, if needed for an index-only scan and
	 * not already done in a previous rescan call.  To save on palloc
	 * overhead, both workspaces are allocated as one palloc block; only this
	 * function and btendscan know that.
	 *
	 * NOTE: this data structure also makes it safe to return data from a
	 * "name" column, even though btree name_ops uses an underlying storage
	 * datatype of cstring.  The risk there is that "name" is supposed to be
	 * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
	 * However, since we only return data out of tuples sitting in the
	 * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
	 * data out of the markTuples array --- running off the end of memory for
	 * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
	 * adding special-case treatment for name_ops elsewhere.
	 */
	if (scan->xs_want_itup && so->currTuples == NULL)
	{
		so->currTuples = (char *) palloc(BLCKSZ * 2);
		so->markTuples = so->currTuples + BLCKSZ;
	}

	/*
	 * Reset the scan keys
	 */
	if (scankey && scan->numberOfKeys > 0)
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */

	/* If any keys are SK_SEARCHARRAY type, set up array-key info */
	_bt_preprocess_array_keys(scan);
}

/*
 *	btendscan() -- close down a scan
 */
void
btendscan(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* we aren't holding any read locks, but gotta drop the pins */
	if (BTScanPosIsValid(so->currPos))
	{
		/* Before leaving current page, deal with any killed items */
		if (so->numKilled > 0)
			_bt_killitems(scan);
		BTScanPosUnpinIfPinned(so->currPos);
	}

	so->markItemIndex = -1;
	BTScanPosUnpinIfPinned(so->markPos);

	/* No need to invalidate positions, the RAM is about to be freed. */

	/* Release storage */
	if (so->keyData != NULL)
		pfree(so->keyData);
	/* so->arrayKeyData and so->arrayKeys are in arrayContext */
	if (so->arrayContext != NULL)
		MemoryContextDelete(so->arrayContext);
	if (so->killedItems != NULL)
		pfree(so->killedItems);
	if (so->currTuples != NULL)
		pfree(so->currTuples);
	/* so->markTuples should not be pfree'd, see btrescan */
	pfree(so);
}

/*
 *	btmarkpos() -- save current scan position
 */
void
btmarkpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* There may be an old mark with a pin (but no lock). */
	BTScanPosUnpinIfPinned(so->markPos);

	/*
	 * Just record the current itemIndex.  If we later step to next page
	 * before releasing the marked position, _bt_steppage makes a full copy of
	 * the currPos struct in markPos.  If (as often happens) the mark is moved
	 * before we leave the page, we don't have to do that work.
	 */
	if (BTScanPosIsValid(so->currPos))
		so->markItemIndex = so->currPos.itemIndex;
	else
	{
		BTScanPosInvalidate(so->markPos);
		so->markItemIndex = -1;
	}

	/* Also record the current positions of any array keys */
	if (so->numArrayKeys)
		_bt_mark_array_keys(scan);
}

/*
 *	btrestrpos() -- restore scan to last saved position
 */
void
btrestrpos(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* Restore the marked positions of any array keys */
	if (so->numArrayKeys)
		_bt_restore_array_keys(scan);

	if (so->markItemIndex >= 0)
	{
		/*
		 * The scan has never moved to a new page since the last mark.  Just
		 * restore the itemIndex.
		 *
		 * NB: In this case we can't count on anything in so->markPos to be
		 * accurate.
		 */
		so->currPos.itemIndex = so->markItemIndex;
	}
	else
	{
		/*
		 * The scan moved to a new page after last mark or restore, and we are
		 * now restoring to the marked page.  We aren't holding any read
		 * locks, but if we're still holding the pin for the current position,
		 * we must drop it.
		 */
		if (BTScanPosIsValid(so->currPos))
		{
			/* Before leaving current page, deal with any killed items */
			if (so->numKilled > 0)
				_bt_killitems(scan);
			BTScanPosUnpinIfPinned(so->currPos);
		}

		if (BTScanPosIsValid(so->markPos))
		{
			/* bump pin on mark buffer for assignment to current buffer */
			if (BTScanPosIsPinned(so->markPos))
				IncrBufferRefCount(so->markPos.buf);
			memcpy(&so->currPos, &so->markPos,
				   offsetof(BTScanPosData, items[1]) +
				   so->markPos.lastItem * sizeof(BTScanPosItem));
			if (so->currTuples)
				memcpy(so->currTuples, so->markTuples,
					   so->markPos.nextTupleOffset);
		}
		else
			BTScanPosInvalidate(so->currPos);
	}
}

/*
 * btestimateparallelscan -- estimate storage for BTParallelScanDescData
 */
Size
btestimateparallelscan(void)
{
	return sizeof(BTParallelScanDescData);
}

/*
 * btinitparallelscan -- initialize BTParallelScanDesc for parallel btree scan
 */
void
btinitparallelscan(void *target)
{
	BTParallelScanDesc bt_target = (BTParallelScanDesc) target;

	SpinLockInit(&bt_target->btps_mutex);
	bt_target->btps_scanPage = InvalidBlockNumber;
	bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	bt_target->btps_arrayKeyCount = 0;
	ConditionVariableInit(&bt_target->btps_cv);
}

/*
 *	btparallelrescan() -- reset parallel scan
 */
void
btparallelrescan(IndexScanDesc scan)
{
	BTParallelScanDesc btscan;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;

	Assert(parallel_scan);

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * In theory, we don't need to acquire the spinlock here, because there
	 * shouldn't be any other workers running at this point, but we do so for
	 * consistency.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = InvalidBlockNumber;
	btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
	btscan->btps_arrayKeyCount = 0;
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * _bt_parallel_seize() -- Begin the process of advancing the scan to a new
 *		page.  Other scans must wait until we call _bt_parallel_release()
 *		or _bt_parallel_done().
 *
 * The return value is true if we successfully seized the scan and false
 * if we did not.  The latter case occurs if no pages remain for the current
 * set of scankeys.
 *
 * If the return value is true, *pageno returns the next or current page
 * of the scan (depending on the scan direction).  An invalid block number
 * means the scan hasn't yet started, and P_NONE means we've reached the end.
 * The first time a participating process reaches the last page, it will return
 * true and set *pageno to P_NONE; after that, further attempts to seize the
 * scan will return false.
 *
 * Callers should ignore the value of pageno if the return value is false.
 */
bool
_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	BTPS_State	pageStatus;
	bool		exit_loop = false;
	bool		status = true;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	*pageno = P_NONE;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	while (1)
	{
		SpinLockAcquire(&btscan->btps_mutex);
		pageStatus = btscan->btps_pageStatus;

		if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
		{
			/* Parallel scan has already advanced to a new set of scankeys. */
			status = false;
		}
		else if (pageStatus == BTPARALLEL_DONE)
		{
			/*
			 * We're done with this set of scankeys.  This may be the end, or
			 * there could be more sets to try.
			 */
			status = false;
		}
		else if (pageStatus != BTPARALLEL_ADVANCING)
		{
			/*
			 * We have successfully seized control of the scan for the purpose
			 * of advancing it to a new page!
			 */
			btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
			*pageno = btscan->btps_scanPage;
			exit_loop = true;
		}
		SpinLockRelease(&btscan->btps_mutex);
		if (exit_loop || !status)
			break;
		ConditionVariableSleep(&btscan->btps_cv, WAIT_EVENT_BTREE_PAGE);
	}
	ConditionVariableCancelSleep();

	return status;
}
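
/*
 * Typical caller protocol (the callers live in nbtsearch.c, e.g. _bt_first):
 * call _bt_parallel_seize() to obtain the next block to read; on success,
 * read that page and call _bt_parallel_release() as soon as the following
 * page's block number is known, or _bt_parallel_done() if the scan has run
 * off the end of the index.
 */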

/*
 * _bt_parallel_release() -- Complete the process of advancing the scan to a
 *		new page.  We now have the new value btps_scanPage; some other backend
 *		can now begin advancing the scan.
 */
void
_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
{
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	SpinLockAcquire(&btscan->btps_mutex);
	btscan->btps_scanPage = scan_page;
	btscan->btps_pageStatus = BTPARALLEL_IDLE;
	SpinLockRelease(&btscan->btps_mutex);
	ConditionVariableSignal(&btscan->btps_cv);
}

/*
 * _bt_parallel_done() -- Mark the parallel scan as complete.
 *
 * When there are no pages left to scan, this function should be called to
 * notify other workers.  Otherwise, they might wait forever for the scan to
 * advance to the next page.
 */
void
_bt_parallel_done(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;
	bool		status_changed = false;

	/* Do nothing, for non-parallel scans */
	if (parallel_scan == NULL)
		return;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	/*
	 * Mark the parallel scan as done for this combination of scan keys,
	 * unless some other process already did so.  See also
	 * _bt_advance_array_keys.
	 */
	SpinLockAcquire(&btscan->btps_mutex);
	if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
		btscan->btps_pageStatus != BTPARALLEL_DONE)
	{
		btscan->btps_pageStatus = BTPARALLEL_DONE;
		status_changed = true;
	}
	SpinLockRelease(&btscan->btps_mutex);

	/* wake up all the workers associated with this parallel scan */
	if (status_changed)
		ConditionVariableBroadcast(&btscan->btps_cv);
}

/*
 * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
 *			keys.
 *
 * Updates the count of array keys processed for both local and parallel
 * scans.
 */
void
_bt_parallel_advance_array_keys(IndexScanDesc scan)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
	BTParallelScanDesc btscan;

	btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
												  parallel_scan->ps_offset);

	so->arrayKeyCount++;
	SpinLockAcquire(&btscan->btps_mutex);
	if (btscan->btps_pageStatus == BTPARALLEL_DONE)
	{
		btscan->btps_scanPage = InvalidBlockNumber;
		btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
		btscan->btps_arrayKeyCount++;
	}
	SpinLockRelease(&btscan->btps_mutex);
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state)
{
	Relation	rel = info->index;
	BTCycleId	cycleid;

	/* allocate stats if first time through, else re-use existing struct */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

	/* Establish the vacuum cycle ID to use for this scan */
	/* The ENSURE stuff ensures we clean up shared memory on failure */
	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);

		btvacuumscan(info, stats, callback, callback_state, cycleid);
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);

	return stats;
}
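
/*
 * The cycle ID obtained from _bt_start_vacuum() above is stamped on pages
 * by page splits that occur while this VACUUM is running; btvacuumpage()
 * uses it to recognize right-half pages it may otherwise have skipped (see
 * the backtracking logic there).
 */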

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
	BlockNumber num_delpages;

	/* No-op in ANALYZE ONLY mode */
	if (info->analyze_only)
		return stats;

	/*
	 * If btbulkdelete was called, we need not do anything (we just maintain
	 * the information used within _bt_vacuum_needs_cleanup() by calling
	 * _bt_set_cleanup_info() below).
	 *
	 * If btbulkdelete was _not_ called, then we have a choice to make: we
	 * must decide whether or not a btvacuumscan() call is needed now (i.e.
	 * whether the ongoing VACUUM operation can entirely avoid a physical scan
	 * of the index).  A call to _bt_vacuum_needs_cleanup() decides it for us
	 * now.
	 */
	if (stats == NULL)
	{
		/* Check if VACUUM operation can entirely avoid btvacuumscan() call */
		if (!_bt_vacuum_needs_cleanup(info->index, info->heaprel))
			return NULL;

		/*
		 * Since we aren't going to actually delete any leaf items, there's no
		 * need to go through all the vacuum-cycle-ID pushups here.
		 *
		 * Posting list tuples are a source of inaccuracy for cleanup-only
		 * scans.  btvacuumscan() will assume that the number of index tuples
		 * from each page can be used as num_index_tuples, even though
		 * num_index_tuples is supposed to represent the number of TIDs in the
		 * index.  This naive approach can underestimate the number of tuples
		 * in the index significantly.
		 *
		 * We handle the problem by making num_index_tuples an estimate in
		 * cleanup-only case.
		 */
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
		btvacuumscan(info, stats, NULL, NULL, 0);
		stats->estimated_count = true;
	}

	/*
	 * Maintain num_delpages value in metapage for _bt_vacuum_needs_cleanup().
	 *
	 * num_delpages is the number of deleted pages now in the index that were
	 * not safe to place in the FSM to be recycled just yet.  num_delpages is
	 * greater than 0 only when _bt_pagedel() actually deleted pages during
	 * our call to btvacuumscan().  Even then, _bt_pendingfsm_finalize() must
	 * have failed to place any newly deleted pages in the FSM just moments
	 * ago.  (Actually, there are edge cases where recycling of the current
	 * VACUUM's newly deleted pages does not even become safe by the time the
	 * next VACUUM comes around.  See nbtree/README.)
	 */
	Assert(stats->pages_deleted >= stats->pages_free);
	num_delpages = stats->pages_deleted - stats->pages_free;
	_bt_set_cleanup_info(info->index, info->heaprel, num_delpages);

	/*
	 * It's quite possible for us to be fooled by concurrent page splits into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
	if (!info->estimated_count)
	{
		if (stats->num_index_tuples > info->num_heap_tuples)
			stats->num_index_tuples = info->num_heap_tuples;
	}

	return stats;
}

/*
 * btvacuumscan --- scan the index for VACUUMing purposes
 *
 * This combines the functions of looking for leaf tuples that are deletable
 * according to the vacuum callback, looking for empty pages that can be
 * deleted, and looking for old deleted pages that can be recycled.  Both
 * btbulkdelete and btvacuumcleanup invoke this (the latter only if no
 * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
 *
 * The caller is responsible for initially allocating/zeroing a stats struct
 * and for obtaining a vacuum cycle ID if necessary.
 */
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			 IndexBulkDeleteCallback callback, void *callback_state,
			 BTCycleId cycleid)
{
	Relation	rel = info->index;
	BTVacState	vstate;
	BlockNumber num_pages;
	BlockNumber scanblkno;
	bool		needLock;

	/*
	 * Reset fields that track information about the entire index now.  This
	 * avoids double-counting in the case where a single VACUUM command
	 * requires multiple scans of the index.
	 *
	 * Avoid resetting the tuples_removed and pages_newly_deleted fields here,
	 * since they track information about the VACUUM command, and so must last
	 * across each call to btvacuumscan().
	 *
	 * (Note that pages_free is treated as state about the whole index, not
	 * the current VACUUM.  This is appropriate because RecordFreeIndexPage()
	 * calls are idempotent, and get repeated for the same deleted pages in
	 * some scenarios.  The point for us is to track the number of recyclable
	 * pages in the index at the end of the VACUUM command.)
	 */
	stats->num_pages = 0;
	stats->num_index_tuples = 0;
	stats->pages_deleted = 0;
	stats->pages_free = 0;

	/* Set up info to pass down to btvacuumpage */
	vstate.info = info;
	vstate.stats = stats;
	vstate.callback = callback;
	vstate.callback_state = callback_state;
	vstate.cycleid = cycleid;

	/* Create a temporary memory context to run _bt_pagedel in */
	vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
												  "_bt_pagedel",
												  ALLOCSET_DEFAULT_SIZES);

	/* Initialize vstate fields used by _bt_pendingfsm_finalize */
	vstate.bufsize = 0;
	vstate.maxbufsize = 0;
	vstate.pendingpages = NULL;
	vstate.npendingpages = 0;
	/* Consider applying _bt_pendingfsm_finalize optimization */
	_bt_pendingfsm_init(rel, &vstate, (callback == NULL));

	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  Hence, we must repeatedly check the
	 * relation length.  We must acquire the relation-extension lock while
	 * doing so to avoid a race condition: if someone else is extending the
	 * relation, there is a window where bufmgr/smgr have created a new
	 * all-zero page but it hasn't yet been write-locked by _bt_getbuf().  If
	 * we manage to scan such a page here, we'll improperly assume it can be
	 * recycled.  Taking the lock synchronizes things enough to prevent a
	 * problem: either num_pages won't include the new page, or _bt_getbuf
	 * already has write lock on the buffer and it will be fully initialized
	 * before we can examine it.  Also, we need not worry if a page is added
	 * immediately after we look; the page splitting code already has
	 * write-lock on the left page before it adds a right page, so we must
	 * already have processed any tuples due to be moved into such a page.
	 *
	 * XXX: Now that new pages are locked with RBM_ZERO_AND_LOCK, I don't
	 * think the use of the extension lock is still required.
	 *
	 * We can skip locking for new or temp relations, however, since no one
	 * else could be accessing them.
	 */
	needLock = !RELATION_IS_LOCAL(rel);

	scanblkno = BTREE_METAPAGE + 1;
	for (;;)
	{
		/* Get the current relation length */
		if (needLock)
			LockRelationForExtension(rel, ExclusiveLock);
		num_pages = RelationGetNumberOfBlocks(rel);
		if (needLock)
			UnlockRelationForExtension(rel, ExclusiveLock);

		if (info->report_progress)
			pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
										 num_pages);

		/* Quit if we've scanned the whole relation */
		if (scanblkno >= num_pages)
			break;
		/* Iterate over pages, then loop back to recheck length */
		for (; scanblkno < num_pages; scanblkno++)
		{
			btvacuumpage(&vstate, scanblkno);
			if (info->report_progress)
				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
											 scanblkno);
		}
	}

	/* Set statistics num_pages field to final size of index */
	stats->num_pages = num_pages;

	MemoryContextDelete(vstate.pagedelcontext);

	/*
	 * If there were any calls to _bt_pagedel() during scan of the index then
	 * see if any of the resulting pages can be placed in the FSM now.  When
	 * it's not safe we'll have to leave it up to a future VACUUM operation.
	 *
	 * Finally, if we placed any pages in the FSM (either just now or during
	 * the scan), forcibly update the upper-level FSM pages to ensure that
	 * searchers can find them.
	 */
	_bt_pendingfsm_finalize(rel, &vstate);
	if (stats->pages_free > 0)
		IndexFreeSpaceMapVacuum(rel);
}

/*
 * btvacuumpage --- VACUUM one page
 *
 * This processes a single page for btvacuumscan().  In some cases we must
 * backtrack to re-examine and VACUUM pages that were the scanblkno during
 * a previous call here.  This is how we handle page splits (that happened
 * after our cycleid was acquired) whose right half page happened to reuse
 * a block that we might have processed at some point before it was
 * recycled (i.e. before the page split).
 */
static void
btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
{
	IndexVacuumInfo *info = vstate->info;
	IndexBulkDeleteResult *stats = vstate->stats;
	IndexBulkDeleteCallback callback = vstate->callback;
	void	   *callback_state = vstate->callback_state;
	Relation	rel = info->index;
	Relation	heaprel = info->heaprel;
	bool		attempt_pagedel;
	BlockNumber blkno,
				backtrack_to;
	Buffer		buf;
	Page		page;
	BTPageOpaque opaque;

	blkno = scanblkno;

backtrack:

	attempt_pagedel = false;
	backtrack_to = P_NONE;

	/* call vacuum_delay_point while not holding any buffer lock */
	vacuum_delay_point();

	/*
	 * We can't use _bt_getbuf() here because it always applies
	 * _bt_checkpage(), which will barf on an all-zero page.  We want to
	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
	 * buffer access strategy.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
							 info->strategy);
	_bt_lockbuf(rel, buf, BT_READ);
	page = BufferGetPage(buf);
	opaque = NULL;
	if (!PageIsNew(page))
	{
		_bt_checkpage(rel, buf);
		opaque = BTPageGetOpaque(page);
	}

	Assert(blkno <= scanblkno);
	if (blkno != scanblkno)
	{
		/*
		 * We're backtracking.
		 *
		 * We followed a right link to a sibling leaf page (a page that
		 * happens to be from a block located before scanblkno).  The only
		 * case we want to do anything with is a live leaf page having the
		 * current vacuum cycle ID.
		 *
		 * The page had better be in a state that's consistent with what we
		 * expect.  Check for conditions that imply corruption in passing.  It
		 * can't be half-dead because only an interrupted VACUUM process can
		 * leave pages in that state, so we'd definitely have dealt with it
		 * back when the page was the scanblkno page (half-dead pages are
		 * always marked fully deleted by _bt_pagedel()).  This assumes that
		 * there can be only one vacuum process running at a time.
		 */
		if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
		{
			Assert(false);
			ereport(LOG,
					(errcode(ERRCODE_INDEX_CORRUPTED),
					 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
									 blkno, scanblkno, RelationGetRelationName(rel))));
			_bt_relbuf(rel, buf);
			return;
		}

		/*
		 * We may have already processed the page in an earlier call, when the
		 * page was scanblkno.  This happens when the leaf page split occurred
		 * after the scan began, but before the right sibling page became the
		 * scanblkno.
		 *
		 * Page may also have been deleted by current btvacuumpage() call,
		 * since _bt_pagedel() sometimes deletes the right sibling page of
		 * scanblkno in passing (it does so after we decided where to
		 * backtrack to).  We don't need to process this page as a deleted
		 * page a second time now (in fact, it would be wrong to count it as a
		 * deleted page in the bulk delete statistics a second time).
		 */
		if (opaque->btpo_cycleid != vstate->cycleid || P_ISDELETED(opaque))
		{
			/* Done with current scanblkno (and all lower split pages) */
			_bt_relbuf(rel, buf);
			return;
		}
	}

	if (!opaque || BTPageIsRecyclable(page, heaprel))
	{
		/* Okay to recycle this page (which could be leaf or internal) */
		RecordFreeIndexPage(rel, blkno);
		stats->pages_deleted++;
		stats->pages_free++;
	}
	else if (P_ISDELETED(opaque))
	{
		/*
		 * Already deleted page (which could be leaf or internal).  Can't
		 * recycle yet.
		 */
		stats->pages_deleted++;
	}
	else if (P_ISHALFDEAD(opaque))
	{
		/* Half-dead leaf page (from interrupted VACUUM) -- finish deleting */
		attempt_pagedel = true;

		/*
		 * _bt_pagedel() will increment both pages_newly_deleted and
		 * pages_deleted stats in all cases (barring corruption)
		 */
	}
	else if (P_ISLEAF(opaque))
	{
		OffsetNumber deletable[MaxIndexTuplesPerPage];
		int			ndeletable;
		BTVacuumPosting updatable[MaxIndexTuplesPerPage];
		int			nupdatable;
		OffsetNumber offnum,
					minoff,
					maxoff;
		int			nhtidsdead,
					nhtidslive;

		/*
		 * Trade in the initial read lock for a full cleanup lock on this
		 * page.  We must get such a lock on every leaf page over the course
		 * of the vacuum scan, whether or not it actually contains any
		 * deletable tuples --- see nbtree/README.
		 */
		_bt_upgradelockbufcleanup(rel, buf);

		/*
		 * Check whether we need to backtrack to earlier pages.  What we are
		 * concerned about is a page split that happened since we started the
		 * vacuum scan.  If the split moved tuples on the right half of the
		 * split (i.e. the tuples that sort high) to a block that we already
		 * passed over, then we might have missed the tuples.  We need to
		 * backtrack now.  (Must do this before possibly clearing btpo_cycleid
		 * or deleting scanblkno page below!)
		 */
		if (vstate->cycleid != 0 &&
			opaque->btpo_cycleid == vstate->cycleid &&
			!(opaque->btpo_flags & BTP_SPLIT_END) &&
			!P_RIGHTMOST(opaque) &&
			opaque->btpo_next < scanblkno)
			backtrack_to = opaque->btpo_next;

		ndeletable = 0;
		nupdatable = 0;
		minoff = P_FIRSTDATAKEY(opaque);
		maxoff = PageGetMaxOffsetNumber(page);
		nhtidsdead = 0;
		nhtidslive = 0;
		if (callback)
		{
			/* btbulkdelete callback tells us what to delete (or update) */
			for (offnum = minoff;
				 offnum <= maxoff;
				 offnum = OffsetNumberNext(offnum))
			{
				IndexTuple	itup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offnum));

				Assert(!BTreeTupleIsPivot(itup));
				if (!BTreeTupleIsPosting(itup))
				{
					/* Regular tuple, standard table TID representation */
					if (callback(&itup->t_tid, callback_state))
					{
						deletable[ndeletable++] = offnum;
						nhtidsdead++;
					}
					else
						nhtidslive++;
				}
				else
				{
					BTVacuumPosting vacposting;
					int			nremaining;

					/* Posting list tuple */
					vacposting = btreevacuumposting(vstate, itup, offnum,
													&nremaining);
					if (vacposting == NULL)
					{
						/*
						 * All table TIDs from the posting tuple remain, so no
						 * delete or update required
						 */
						Assert(nremaining == BTreeTupleGetNPosting(itup));
					}
					else if (nremaining > 0)
					{

						/*
						 * Store metadata about posting list tuple in
						 * updatable array for entire page.  Existing tuple
						 * will be updated during the later call to
						 * _bt_delitems_vacuum().
						 */
						Assert(nremaining < BTreeTupleGetNPosting(itup));
						updatable[nupdatable++] = vacposting;
						nhtidsdead += BTreeTupleGetNPosting(itup) - nremaining;
					}
					else
					{
						/*
						 * All table TIDs from the posting list must be
						 * deleted.  We'll delete the index tuple completely
						 * (no update required).
						 */
						Assert(nremaining == 0);
						deletable[ndeletable++] = offnum;
						nhtidsdead += BTreeTupleGetNPosting(itup);
						pfree(vacposting);
					}

					nhtidslive += nremaining;
				}
			}
		}

		/*
		 * Apply any needed deletes or updates.  We issue just one
		 * _bt_delitems_vacuum() call per page, so as to minimize WAL traffic.
		 */
		if (ndeletable > 0 || nupdatable > 0)
		{
			Assert(nhtidsdead >= ndeletable + nupdatable);
			_bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable,
								nupdatable);

			stats->tuples_removed += nhtidsdead;
			/* must recompute maxoff */
			maxoff = PageGetMaxOffsetNumber(page);

			/* can't leak memory here */
			for (int i = 0; i < nupdatable; i++)
				pfree(updatable[i]);
		}
		else
		{
			/*
			 * If the leaf page has been split during this vacuum cycle, it
			 * seems worth expending a write to clear btpo_cycleid even if we
			 * don't have any deletions to do.  (If we do, _bt_delitems_vacuum
			 * takes care of this.)  This ensures we won't process the page
			 * again.
			 *
			 * We treat this like a hint-bit update because there's no need to
			 * WAL-log it.
			 */
			Assert(nhtidsdead == 0);
			if (vstate->cycleid != 0 &&
				opaque->btpo_cycleid == vstate->cycleid)
			{
				opaque->btpo_cycleid = 0;
				MarkBufferDirtyHint(buf, true);
			}
		}

		/*
		 * If the leaf page is now empty, try to delete it; else count the
		 * live tuples (live table TIDs in posting lists are counted as
		 * separate live tuples).  We don't delete when backtracking, though,
		 * since that would require teaching _bt_pagedel() about backtracking
		 * (doesn't seem worth adding more complexity to deal with that).
		 *
		 * We don't count the number of live TIDs during cleanup-only calls to
		 * btvacuumscan (i.e. when callback is not set).  We count the number
		 * of index tuples directly instead.  This avoids the expense of
		 * directly examining all of the tuples on each page.  VACUUM will
		 * treat num_index_tuples as an estimate in cleanup-only case, so it
		 * doesn't matter that this underestimates num_index_tuples
		 * significantly in some cases.
		 */
		if (minoff > maxoff)
			attempt_pagedel = (blkno == scanblkno);
		else if (callback)
			stats->num_index_tuples += nhtidslive;
		else
			stats->num_index_tuples += maxoff - minoff + 1;

		Assert(!attempt_pagedel || nhtidslive == 0);
	}

	if (attempt_pagedel)
	{
		MemoryContext oldcontext;

		/* Run pagedel in a temp context to avoid memory leakage */
		MemoryContextReset(vstate->pagedelcontext);
		oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

		/*
		 * _bt_pagedel maintains the bulk delete stats on our behalf;
		 * pages_newly_deleted and pages_deleted are likely to be incremented
		 * during call
		 */
		Assert(blkno == scanblkno);
		_bt_pagedel(rel, buf, vstate);

		MemoryContextSwitchTo(oldcontext);
		/* pagedel released buffer, so we shouldn't */
	}
	else
		_bt_relbuf(rel, buf);

	if (backtrack_to != P_NONE)
	{
		blkno = backtrack_to;
		goto backtrack;
	}
}

/*
 * btreevacuumposting --- determine TIDs still needed in posting list
 *
 * Returns metadata describing how to build replacement tuple without the TIDs
 * that VACUUM needs to delete.  Returned value is NULL in the common case
 * where no changes are needed to caller's posting list tuple (we avoid
 * allocating memory here as an optimization).
 *
 * The number of TIDs that should remain in the posting list tuple is set for
 * caller in *nremaining.
 */
static BTVacuumPosting
btreevacuumposting(BTVacState *vstate, IndexTuple posting,
				   OffsetNumber updatedoffset, int *nremaining)
{
	int			live = 0;
	int			nitem = BTreeTupleGetNPosting(posting);
	ItemPointer items = BTreeTupleGetPosting(posting);
	BTVacuumPosting vacposting = NULL;

	for (int i = 0; i < nitem; i++)
	{
		if (!vstate->callback(items + i, vstate->callback_state))
		{
			/* Live table TID */
			live++;
		}
		else if (vacposting == NULL)
		{
			/*
			 * First dead table TID encountered.
			 *
			 * It's now clear that we need to delete one or more dead table
			 * TIDs, so start maintaining metadata describing how to update
			 * existing posting list tuple.
			 */
			vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
								nitem * sizeof(uint16));

			vacposting->itup = posting;
			vacposting->updatedoffset = updatedoffset;
			vacposting->ndeletedtids = 0;
			vacposting->deletetids[vacposting->ndeletedtids++] = i;
		}
		else
		{
			/* Second or subsequent dead table TID */
			vacposting->deletetids[vacposting->ndeletedtids++] = i;
		}
	}

	*nremaining = live;
	return vacposting;
}
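
/*
 * For example, given a posting list tuple with 5 TIDs of which the callback
 * reports 2 as dead (say at indexes 1 and 3), btreevacuumposting() returns a
 * BTVacuumPosting with ndeletedtids = 2 and deletetids = {1, 3}, and sets
 * *nremaining to 3; _bt_delitems_vacuum() later rewrites the tuple with the
 * 3 surviving TIDs.
 */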

/*
 * btcanreturn() -- Check whether btree indexes support index-only scans.
 *
 * btrees always do, so this is trivial.
 */
bool
btcanreturn(Relation index, int attno)
{
	return true;
}