/*-------------------------------------------------------------------------
 *
 * hash.c
 *    Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hash.c
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "catalog/index.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/plancat.h"
#include "pgstat.h"
#include "utils/builtins.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"

/* Working state for hashbuild and its callback */
typedef struct
{
    HSpool     *spool;          /* NULL if not using spooling */
    double      indtuples;      /* # tuples accepted into index */
    Relation    heapRel;        /* heap relation descriptor */
} HashBuildState;

static void hashbuildCallback(Relation index,
                              ItemPointer tid,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);


/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = HTMaxStrategyNumber;
    amroutine->amsupport = HASHNProcs;
    amroutine->amoptsprocnum = HASHOPTIONS_PROC;
    amroutine->amcanorder = false;
    amroutine->amcanorderbyop = false;
    amroutine->amcanbackward = true;
    amroutine->amcanunique = false;
    amroutine->amcanmulticol = false;
    amroutine->amoptionalkey = false;
    amroutine->amsearcharray = false;
    amroutine->amsearchnulls = false;
    amroutine->amstorage = false;
    amroutine->amclusterable = false;
    amroutine->ampredlocks = true;
    amroutine->amcanparallel = false;
    amroutine->amcaninclude = false;
    amroutine->amusemaintenanceworkmem = false;
    amroutine->amsummarizing = false;
    amroutine->amparallelvacuumoptions =
        VACUUM_OPTION_PARALLEL_BULKDEL;
    amroutine->amkeytype = INT4OID;

    amroutine->ambuild = hashbuild;
    amroutine->ambuildempty = hashbuildempty;
    amroutine->aminsert = hashinsert;
    amroutine->ambulkdelete = hashbulkdelete;
    amroutine->amvacuumcleanup = hashvacuumcleanup;
    amroutine->amcanreturn = NULL;
    amroutine->amcostestimate = hashcostestimate;
    amroutine->amoptions = hashoptions;
    amroutine->amproperty = NULL;
    amroutine->ambuildphasename = NULL;
    amroutine->amvalidate = hashvalidate;
    amroutine->amadjustmembers = hashadjustmembers;
    amroutine->ambeginscan = hashbeginscan;
    amroutine->amrescan = hashrescan;
    amroutine->amgettuple = hashgettuple;
    amroutine->amgetbitmap = hashgetbitmap;
    amroutine->amendscan = hashendscan;
    amroutine->ammarkpos = NULL;
    amroutine->amrestrpos = NULL;
    amroutine->amestimateparallelscan = NULL;
    amroutine->aminitparallelscan = NULL;
    amroutine->amparallelrescan = NULL;

    PG_RETURN_POINTER(amroutine);
}
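
/*
 * Illustrative usage (editor's addition, not part of the original file;
 * table and column names are hypothetical): the routine above is what
 * exposes "hash" as an index access method, e.g.
 *
 *      CREATE INDEX orders_customer_hash ON orders USING hash (customer_id);
 *
 * Given the flags set above, such an index supports only single-column,
 * non-unique equality lookups and cannot be used for ordered scans.
 */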

/*
 * hashbuild() -- build a new hash index.
 */
IndexBuildResult *
hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
    IndexBuildResult *result;
    BlockNumber relpages;
    double      reltuples;
    double      allvisfrac;
    uint32      num_buckets;
    long        sort_threshold;
    HashBuildState buildstate;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* Estimate the number of rows currently present in the table */
    estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

    /* Initialize the hash index metadata page and initial buckets */
    num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);

    /*
     * If we just insert the tuples into the index in scan order, then
     * (assuming their hash codes are pretty random) there will be no locality
     * of access to the index, and if the index is bigger than available RAM
     * then we'll thrash horribly.  To prevent that scenario, we can sort the
     * tuples by (expected) bucket number.  However, such a sort is useless
     * overhead when the index does fit in RAM.  We choose to sort if the
     * initial index size exceeds maintenance_work_mem, or the number of
     * buffers usable for the index, whichever is less.  (Limiting by the
     * number of buffers should reduce thrashing between PG buffers and kernel
     * buffers, which seems useful even if no physical I/O results.  Limiting
     * by maintenance_work_mem is useful to allow easy testing of the sort
     * code path, and may be useful to DBAs as an additional control knob.)
     *
     * NOTE: this test will need adjustment if a bucket is ever different from
     * one page.  Also, "initial index size" accounting does not include the
     * metapage, nor the first bitmap page.
     */
    sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
    if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
        sort_threshold = Min(sort_threshold, NBuffers);
    else
        sort_threshold = Min(sort_threshold, NLocBuffer);
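
    /*
     * Worked example (editor's illustration, assuming stock defaults rather
     * than anything stated in this file): with maintenance_work_mem = 64MB
     * and BLCKSZ = 8192, the first bound is (65536 * 1024) / 8192 = 8192
     * pages; with shared_buffers = 128MB, NBuffers is 16384, so
     * sort_threshold ends up at 8192 and spooling is used only when the
     * initial index would contain at least 8192 buckets.
     */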

    if (num_buckets >= (uint32) sort_threshold)
        buildstate.spool = _h_spoolinit(heap, index, num_buckets);
    else
        buildstate.spool = NULL;

    /* prepare to build the index */
    buildstate.indtuples = 0;
    buildstate.heapRel = heap;

    /* do the heap scan */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       hashbuildCallback,
                                       (void *) &buildstate, NULL);
    pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
                                 buildstate.indtuples);

    if (buildstate.spool)
    {
        /* sort the tuples and insert them into the index */
        _h_indexbuild(buildstate.spool, buildstate.heapRel);
        _h_spooldestroy(buildstate.spool);
    }

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    return result;
}
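
/*
 * Editor's note (not part of the original source): because the build
 * callback below skips rows whose key cannot be converted to a hash key
 * (e.g. NULL keys), result->index_tuples can legitimately be smaller than
 * result->heap_tuples.
 */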

/*
 * hashbuildempty() -- build an empty hash index in the initialization fork
 */
void
hashbuildempty(Relation index)
{
    _hash_init(index, 0, INIT_FORKNUM);
}
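
/*
 * Note (editor's addition, not in the original source): ambuildempty is
 * invoked for unlogged indexes; writing an empty index into the init fork
 * lets crash recovery reset the main fork to a valid empty state.
 */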

/*
 * Per-tuple callback for table_index_build_scan
 */
static void
hashbuildCallback(Relation index,
                  ItemPointer tid,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    HashBuildState *buildstate = (HashBuildState *) state;
    Datum       index_values[1];
    bool        index_isnull[1];
    IndexTuple  itup;

    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(index,
                             values, isnull,
                             index_values, index_isnull))
        return;

    /* Either spool the tuple for sorting, or just put it into the index */
    if (buildstate->spool)
        _h_spool(buildstate->spool, tid, index_values, index_isnull);
    else
    {
        /* form an index tuple and point it at the heap tuple */
        itup = index_form_tuple(RelationGetDescr(index),
                                index_values, index_isnull);
        itup->t_tid = *tid;
        _hash_doinsert(index, itup, buildstate->heapRel, false);
        pfree(itup);
    }

    buildstate->indtuples += 1;
}

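/*
 * Editor's note (not part of the original file): _hash_convert_tuple()
 * reduces the user's key to its 32-bit hash code, which is the single int4
 * column actually stored in the index (hence amkeytype = INT4OID in
 * hashhandler above).  Because only the hash code is kept, every scan is
 * lossy and the executor must recheck the original condition against the
 * heap tuple; see scan->xs_recheck in hashgettuple and the recheck flag
 * passed to tbm_add_tuples in hashgetbitmap below.
 */
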
/*
 * hashinsert() -- insert an index tuple into a hash table.
 *
 * Hash on the heap tuple's key, form an index tuple with hash code.
 * Find the appropriate location for the new tuple, and put it there.
 */
bool
hashinsert(Relation rel, Datum *values, bool *isnull,
           ItemPointer ht_ctid, Relation heapRel,
           IndexUniqueCheck checkUnique,
           bool indexUnchanged,
           IndexInfo *indexInfo)
{
    Datum       index_values[1];
    bool        index_isnull[1];
    IndexTuple  itup;

    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(rel,
                             values, isnull,
                             index_values, index_isnull))
        return false;

    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
    itup->t_tid = *ht_ctid;

    _hash_doinsert(rel, itup, heapRel, false);

    pfree(itup);

    return false;
}


/*
 * hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;

    /* Hash indexes are always lossy since we store only the hash code */
    scan->xs_recheck = true;

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction.  If we haven't done so yet, we call a routine
     * to get the first item in the scan.
     */
    if (!HashScanPosIsValid(so->currPos))
        res = _hash_first(scan, dir);
    else
    {
        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, so remember it for later.  (We'll deal with all such
             * tuples at once right after leaving the index page or at end
             * of scan.)  If the caller reverses the scan direction, the
             * same item might get entered multiple times.  We don't detect
             * that; instead, we just forget any excess entries.
             */
            if (so->killedItems == NULL)
                so->killedItems = (int *)
                    palloc(MaxIndexTuplesPerPage * sizeof(int));

            if (so->numKilled < MaxIndexTuplesPerPage)
                so->killedItems[so->numKilled++] = so->currPos.itemIndex;
        }

        /*
         * Now continue the scan.
         */
        res = _hash_next(scan, dir);
    }

    return res;
}


/*
 * hashgetbitmap() -- get all tuples at once
 */
int64
hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;
    int64       ntids = 0;
    HashScanPosItem *currItem;

    res = _hash_first(scan, ForwardScanDirection);

    while (res)
    {
        currItem = &so->currPos.items[so->currPos.itemIndex];

        /*
         * _hash_first and _hash_next already eliminate dead index entries
         * whenever scan->ignore_killed_tuples is true.  Therefore, there's
         * nothing to do here except add the results to the TIDBitmap.
         */
        tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
        ntids++;

        res = _hash_next(scan, ForwardScanDirection);
    }

    return ntids;
}


/*
 * hashbeginscan() -- start a scan on a hash index
 */
IndexScanDesc
hashbeginscan(Relation rel, int nkeys, int norderbys)
{
    IndexScanDesc scan;
    HashScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
    HashScanPosInvalidate(so->currPos);
    so->hashso_bucket_buf = InvalidBuffer;
    so->hashso_split_bucket_buf = InvalidBuffer;

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;

    so->killedItems = NULL;
    so->numKilled = 0;

    scan->opaque = so;

    return scan;
}

/*
 * hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
           ScanKey orderbys, int norderbys)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    if (HashScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _hash_kill_items(scan);
    }

    _hash_dropscanbuf(rel, so);

    /* set position invalid (this will cause _hash_first call) */
    HashScanPosInvalidate(so->currPos);

    /* Update scan key, if a new one is given */
    if (scankey && scan->numberOfKeys > 0)
    {
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
    }

    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;
}

/*
 * hashendscan() -- close down a scan
 */
void
hashendscan(IndexScanDesc scan)
{
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    if (HashScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _hash_kill_items(scan);
    }

    _hash_dropscanbuf(rel, so);

    if (so->killedItems != NULL)
        pfree(so->killedItems);
    pfree(so);
    scan->opaque = NULL;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes the tuples that have been moved to another
 * bucket by a split.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
               IndexBulkDeleteCallback callback, void *callback_state)
{
    Relation    rel = info->index;
    double      tuples_removed;
    double      num_index_tuples;
    double      orig_ntuples;
    Bucket      orig_maxbucket;
    Bucket      cur_maxbucket;
    Bucket      cur_bucket;
    Buffer      metabuf = InvalidBuffer;
    HashMetaPage metap;
    HashMetaPage cachedmetap;

    tuples_removed = 0;
    num_index_tuples = 0;

    /*
     * We need a copy of the metapage so that we can use its hashm_spares[]
     * values to compute bucket page addresses, but a cached copy should be
     * good enough.  (If not, we'll detect that further down and refresh the
     * cache as necessary.)
     */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
    Assert(cachedmetap != NULL);

    orig_maxbucket = cachedmetap->hashm_maxbucket;
    orig_ntuples = cachedmetap->hashm_ntuples;

    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

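    /*
     * Editor's note (not part of the original source): the loop below may be
     * re-entered via the loop_top label if a concurrent split adds buckets
     * while we scan; see the recheck of hashm_maxbucket after the loop.
     */
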
loop_top:
    while (cur_bucket <= cur_maxbucket)
    {
        BlockNumber bucket_blkno;
        BlockNumber blkno;
        Buffer      bucket_buf;
        Buffer      buf;
        HashPageOpaque bucket_opaque;
        Page        page;
        bool        split_cleanup = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

        blkno = bucket_blkno;

        /*
         * We need to acquire a cleanup lock on the primary bucket page to
         * wait out concurrent scans before deleting the dead tuples.
         */
        buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
        LockBufferForCleanup(buf);
        _hash_checkpage(rel, buf, LH_BUCKET_PAGE);

        page = BufferGetPage(buf);
        bucket_opaque = HashPageGetOpaque(page);

        /*
         * If the bucket contains tuples that were moved to another bucket by
         * a split, we need to delete those (now garbage) tuples.  We can't
         * do that while the split of the bucket is still in progress,
         * because scans may still need them.
         */
        if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
            H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
        {
            split_cleanup = true;

            /*
             * This bucket might have been split since we last held a lock on
             * the metapage.  If so, hashm_maxbucket, hashm_highmask and
             * hashm_lowmask might be old enough to cause us to fail to remove
             * tuples left behind by the most recent split.  To prevent that,
             * now that the primary page of the target bucket has been locked
             * (and thus can't be further split), check whether we need to
             * update our cached metapage data.
             */
            Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
            if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
            {
                cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
                Assert(cachedmetap != NULL);
            }
        }

        bucket_buf = buf;

        hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
                          cachedmetap->hashm_maxbucket,
                          cachedmetap->hashm_highmask,
                          cachedmetap->hashm_lowmask, &tuples_removed,
                          &num_index_tuples, split_cleanup,
                          callback, callback_state);

        _hash_dropbuf(rel, bucket_buf);

        /* Advance to next bucket */
        cur_bucket++;
    }

    if (BufferIsInvalid(metabuf))
        metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

    /* Write-lock metapage and check for split since we started */
    LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
        Assert(cachedmetap != NULL);
        cur_maxbucket = cachedmetap->hashm_maxbucket;
        goto loop_top;
    }

    /* Okay, we're really done.  Update tuple count in metapage. */
    START_CRIT_SECTION();

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
         * (Note: we still return estimated_count = false, because using this
         * count is better than not updating reltuples at all.)
         */
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    MarkBufferDirty(metabuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_hash_update_meta_page xlrec;
        XLogRecPtr  recptr;

        xlrec.ntuples = metap->hashm_ntuples;

        XLogBeginInsert();
        XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);

        XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
        PageSetLSN(BufferGetPage(metabuf), recptr);
    }

    END_CRIT_SECTION();

    _hash_relbuf(rel, metabuf);

    /* return statistics */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;
    /* hashvacuumcleanup will fill in num_pages */

    return stats;
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
    Relation    rel = info->index;
    BlockNumber num_pages;

    /* If hashbulkdelete wasn't called, return NULL signifying no change */
    /* Note: this covers the analyze_only case too */
    if (stats == NULL)
        return NULL;

    /* update statistics */
    num_pages = RelationGetNumberOfBlocks(rel);
    stats->num_pages = num_pages;

    return stats;
}

/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be.  If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
{
    BlockNumber blkno;
    Buffer      buf;
    Bucket      new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
    bool        bucket_dirty = false;

    blkno = bucket_blkno;
    buf = bucket_buf;

    if (split_cleanup)
        new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                                                        lowmask, maxbucket);

    /* Scan each page in bucket */
    for (;;)
    {
        HashPageOpaque opaque;
        OffsetNumber offno;
        OffsetNumber maxoffno;
        Buffer      next_buf;
        Page        page;
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable = 0;
        bool        retain_pin = false;
        bool        clear_dead_marking = false;

        vacuum_delay_point();

        page = BufferGetPage(buf);
        opaque = HashPageGetOpaque(page);

        /* Scan each tuple in page */
        maxoffno = PageGetMaxOffsetNumber(page);
        for (offno = FirstOffsetNumber;
             offno <= maxoffno;
             offno = OffsetNumberNext(offno))
        {
            ItemPointer htup;
            IndexTuple  itup;
            Bucket      bucket;
            bool        kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, offno));
            htup = &(itup->t_tid);

            /*
             * To remove dead tuples, we rely strictly on the result of the
             * callback function; see btvacuumpage for the detailed
             * reasoning.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that are moved by split. */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket,
                                              highmask,
                                              lowmask);
                /* mark the item for deletion */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to belong either to the current
                     * bucket or to new_bucket.  This is ensured because we
                     * don't allow further splits of a bucket that still
                     * contains garbage.  See comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }

            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        }

        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;
        else
            retain_pin = false;

        blkno = opaque->hasho_nextblkno;

        /*
         * Apply deletions, advance to next page and write page if needed.
         */
        if (ndeletable > 0)
        {
            /* No ereport(ERROR) until changes are logged */
            START_CRIT_SECTION();

            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;

            /*
             * If vacuum removed any DEAD tuples from the page, mark it clean
             * again by clearing the LH_PAGE_HAS_DEAD_TUPLES flag.
             */
            if (tuples_removed && *tuples_removed > 0 &&
                H_HAS_DEAD_TUPLES(opaque))
            {
                opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
                clear_dead_marking = true;
            }

            MarkBufferDirty(buf);

            /* XLOG stuff */
            if (RelationNeedsWAL(rel))
            {
                xl_hash_delete xlrec;
                XLogRecPtr  recptr;

                xlrec.clear_dead_marking = clear_dead_marking;
                xlrec.is_primary_bucket_page = (buf == bucket_buf);

                XLogBeginInsert();
                XLogRegisterData((char *) &xlrec, SizeOfHashDelete);

                /*
                 * bucket buffer needs to be registered to ensure that we can
                 * acquire a cleanup lock on it during replay.
                 */
                if (!xlrec.is_primary_bucket_page)
                    XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);

                XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
                XLogRegisterBufData(1, (char *) deletable,
                                    ndeletable * sizeof(OffsetNumber));

                recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
                PageSetLSN(BufferGetPage(buf), recptr);
            }

            END_CRIT_SECTION();
        }

        /* bail out if there are no more pages to scan. */
        if (!BlockNumberIsValid(blkno))
            break;

        next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                              LH_OVERFLOW_PAGE,
                                              bstrategy);

        /*
         * release the lock on previous page after acquiring the lock on next
         * page
         */
        if (retain_pin)
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;
    }

    /*
     * Lock the bucket page so we can clear the garbage flag and squeeze the
     * bucket.  If the current buffer is the same as the bucket buffer, we
     * already hold a lock on the bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
    }

    /*
     * Clear the garbage flag from the bucket after deleting the tuples that
     * were moved by a split.  We purposely clear the flag before squeezing
     * the bucket, so that after a restart vacuum won't try to delete the
     * moved-by-split tuples again.
     */
    if (split_cleanup)
    {
        HashPageOpaque bucket_opaque;
        Page        page;

        page = BufferGetPage(bucket_buf);
        bucket_opaque = HashPageGetOpaque(page);

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
        MarkBufferDirty(bucket_buf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
            PageSetLSN(page, recptr);
        }

        END_CRIT_SECTION();
    }

    /*
     * If we have deleted anything, try to compact free space.  To squeeze
     * the bucket we must hold a cleanup lock; otherwise we could disturb the
     * ordering of tuples seen by a scan that started before us.
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}