Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * hashovfl.c
4 : * Overflow page management code for the Postgres hash access method
5 : *
6 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/access/hash/hashovfl.c
12 : *
13 : * NOTES
14 : * Overflow pages look like ordinary relation pages.
15 : *
16 : *-------------------------------------------------------------------------
17 : */
18 : #include "postgres.h"
19 :
20 : #include "access/hash.h"
21 : #include "access/hash_xlog.h"
22 : #include "access/xloginsert.h"
23 : #include "miscadmin.h"
24 : #include "utils/rel.h"
25 :
26 :
27 : static uint32 _hash_firstfreebit(uint32 map);
28 :
29 :
30 : /*
31 : * Convert overflow page bit number (its index in the free-page bitmaps)
32 : * to block number within the index.
33 : */
34 : static BlockNumber
7160 tgl 35 CBC 161 : bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
36 : {
37 161 : uint32 splitnum = metap->hashm_ovflpoint;
38 : uint32 i;
39 :
40 : /* Convert zero-based bitnumber to 1-based page number */
41 161 : ovflbitnum += 1;
42 :
43 : /* Determine the split number for this page (must be >= 1) */
44 161 : for (i = 1;
45 975 : i < splitnum && ovflbitnum > metap->hashm_spares[i];
46 814 : i++)
47 : /* loop */ ;
48 :
49 : /*
50 : * Convert to absolute page number by adding the number of bucket pages
51 : * that exist before this split point.
52 : */
2197 rhaas 53 161 : return (BlockNumber) (_hash_get_totalbuckets(i) + ovflbitnum);
54 : }
55 :
56 : /*
57 : * _hash_ovflblkno_to_bitno
58 : *
59 : * Convert overflow page block number to bit number for free-page bitmap.
60 : */
61 : uint32
2257 62 69 : _hash_ovflblkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
63 : {
7160 tgl 64 69 : uint32 splitnum = metap->hashm_ovflpoint;
65 : uint32 i;
66 : uint32 bitnum;
67 :
68 : /* Determine the split number containing this page */
69 270 : for (i = 1; i <= splitnum; i++)
70 : {
2197 rhaas 71 270 : if (ovflblkno <= (BlockNumber) _hash_get_totalbuckets(i))
7160 tgl 72 4 : break; /* oops */
2197 rhaas 73 266 : bitnum = ovflblkno - _hash_get_totalbuckets(i);
74 :
75 : /*
76 : * bitnum has to be greater than the number of overflow pages added in
77 : * previous split points. The first overflow page at this splitnum (i), if any,
78 : * should start from (_hash_get_totalbuckets(i) +
79 : * metap->hashm_spares[i - 1] + 1).
80 : */
2250 81 266 : if (bitnum > metap->hashm_spares[i - 1] &&
82 266 : bitnum <= metap->hashm_spares[i])
7160 tgl 83 65 : return bitnum - 1; /* -1 to convert 1-based to 0-based */
84 : }
85 :
2250 rhaas 86 4 : ereport(ERROR,
87 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
88 : errmsg("invalid overflow block number %u", ovflblkno)));
89 : return 0; /* keep compiler quiet */
90 : }
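
/*
 * Illustrative sketch, not part of the original file: the two conversions
 * above are inverses of each other for any bit number that maps to an
 * existing overflow page, so a debugging helper along these lines could
 * cross-check a freshly computed block number.
 */
static inline void
check_ovfl_mapping_sketch(HashMetaPage metap, uint32 ovflbitnum)
{
	BlockNumber blkno = bitno_to_blkno(metap, ovflbitnum);

	Assert(_hash_ovflblkno_to_bitno(metap, blkno) == ovflbitnum);
}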
91 :
92 : /*
93 : * _hash_addovflpage
94 : *
95 : * Add an overflow page to the bucket whose last page is pointed to by 'buf'.
96 : *
97 : * On entry, the caller must hold a pin but no lock on 'buf'. Unless asked
98 : * to retain it, the pin is dropped before exiting (we assume the caller is
99 : * not interested in 'buf' anymore); the pin is retained only for the
100 : * primary bucket page. The returned overflow page will be pinned and
101 : * write-locked; it is guaranteed to be empty.
102 : *
103 : * The caller must hold a pin, but no lock, on the metapage buffer.
104 : * That buffer is returned in the same state.
105 : *
106 : * NB: since this could be executed concurrently by multiple processes,
107 : * one should not assume that the returned overflow page will be the
108 : * immediate successor of the originally passed 'buf'. Additional overflow
109 : * pages might have been added to the bucket chain in between.
110 : */
111 : Buffer
2321 112 161 : _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin)
113 : {
114 : Buffer ovflbuf;
115 : Page page;
116 : Page ovflpage;
117 : HashPageOpaque pageopaque;
118 : HashPageOpaque ovflopaque;
119 : HashMetaPage metap;
2232 120 161 : Buffer mapbuf = InvalidBuffer;
121 161 : Buffer newmapbuf = InvalidBuffer;
122 : BlockNumber blkno;
123 : uint32 orig_firstfree;
124 : uint32 splitnum;
125 161 : uint32 *freep = NULL;
126 : uint32 max_ovflpg;
127 : uint32 bit;
128 : uint32 bitmap_page_bit;
129 : uint32 first_page;
130 : uint32 last_bit;
131 : uint32 last_page;
132 : uint32 i,
133 : j;
134 161 : bool page_found = false;
135 :
136 : /*
137 : * Write-lock the tail page. We must maintain the following locking order:
138 : * first acquire the lock on the tail page of the bucket, then the lock on
139 : * the meta page to find and lock a bitmap page; once one is found, the
140 : * lock on the meta page is released, and finally the lock on the new
141 : * overflow buffer is acquired. We need this locking order to avoid
142 : * deadlock with backends that are doing inserts.
143 : *
144 : * Note: We could have avoided locking many buffers here if we made two
145 : * WAL records for acquiring an overflow page (one to allocate an overflow
146 : * page and another to add it to the overflow bucket chain). However, doing
147 : * so can leak an overflow page if the system crashes after allocation.
148 : * Needless to say, it is better to have a single record from a
149 : * performance point of view as well.
150 : */
2298 151 161 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
152 :
153 : /* probably redundant... */
5820 tgl 154 161 : _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
155 :
156 : /* loop to find current tail page, in case someone else inserted too */
157 : for (;;)
7157 tgl 158 UBC 0 : {
159 : BlockNumber nextblkno;
160 :
2545 kgrittn 161 CBC 161 : page = BufferGetPage(buf);
373 michael 162 161 : pageopaque = HashPageGetOpaque(page);
7157 tgl 163 161 : nextblkno = pageopaque->hasho_nextblkno;
164 :
165 161 : if (!BlockNumberIsValid(nextblkno))
166 161 : break;
167 :
168 : /* we assume we do not need to write the unmodified page */
2280 rhaas 169 UBC 0 : if (retain_pin)
170 : {
171 : /* pin will be retained only for the primary bucket page */
2186 tgl 172 0 : Assert((pageopaque->hasho_flag & LH_PAGE_TYPE) == LH_BUCKET_PAGE);
2298 rhaas 173 0 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
174 : }
175 : else
2321 176 0 : _hash_relbuf(rel, buf);
177 :
2280 178 0 : retain_pin = false;
179 :
5820 tgl 180 0 : buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
181 : }
182 :
183 : /* Get exclusive lock on the meta page */
2298 rhaas 184 CBC 161 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
185 :
6363 tgl 186 161 : _hash_checkpage(rel, metabuf, LH_META_PAGE);
2545 kgrittn 187 161 : metap = HashPageGetMeta(BufferGetPage(metabuf));
188 :
189 : /* start search at hashm_firstfree */
7157 tgl 190 161 : orig_firstfree = metap->hashm_firstfree;
191 161 : first_page = orig_firstfree >> BMPG_SHIFT(metap);
192 161 : bit = orig_firstfree & BMPG_MASK(metap);
193 161 : i = first_page;
7160 194 161 : j = bit / BITS_PER_MAP;
195 161 : bit &= ~(BITS_PER_MAP - 1);
196 :
197 : /* outer loop iterates once per bitmap page */
198 : for (;;)
9345 bruce 199 126 : {
200 : BlockNumber mapblkno;
201 : Page mappage;
202 : uint32 last_inpage;
203 :
204 : /* want to end search with the last existing overflow page */
7157 tgl 205 287 : splitnum = metap->hashm_ovflpoint;
206 287 : max_ovflpg = metap->hashm_spares[splitnum] - 1;
207 287 : last_page = max_ovflpg >> BMPG_SHIFT(metap);
208 287 : last_bit = max_ovflpg & BMPG_MASK(metap);
209 :
210 287 : if (i > last_page)
211 126 : break;
212 :
213 161 : Assert(i < metap->hashm_nmaps);
214 161 : mapblkno = metap->hashm_mapp[i];
215 :
7160 216 161 : if (i == last_page)
217 161 : last_inpage = last_bit;
218 : else
7160 tgl 219 UBC 0 : last_inpage = BMPGSZ_BIT(metap) - 1;
220 :
221 : /* Release exclusive lock on metapage while reading bitmap page */
2298 rhaas 222 CBC 161 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
223 :
5820 tgl 224 161 : mapbuf = _hash_getbuf(rel, mapblkno, HASH_WRITE, LH_BITMAP_PAGE);
2545 kgrittn 225 161 : mappage = BufferGetPage(mapbuf);
7157 tgl 226 161 : freep = HashPageGetBitmap(mappage);
227 :
7160 228 287 : for (; bit <= last_inpage; j++, bit += BITS_PER_MAP)
229 : {
9345 bruce 230 161 : if (freep[j] != ALL_SET)
231 : {
2232 rhaas 232 35 : page_found = true;
233 :
234 : /* Reacquire exclusive lock on the meta page */
235 35 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
236 :
237 : /* convert bit to bit number within page */
238 35 : bit += _hash_firstfreebit(freep[j]);
239 35 : bitmap_page_bit = bit;
240 :
241 : /* convert bit to absolute bit number */
242 35 : bit += (i << BMPG_SHIFT(metap));
243 : /* Calculate address of the recycled overflow page */
244 35 : blkno = bitno_to_blkno(metap, bit);
245 :
246 : /* Fetch and init the recycled page */
247 35 : ovflbuf = _hash_getinitbuf(rel, blkno);
248 :
9345 bruce 249 35 : goto found;
250 : }
251 : }
252 :
253 : /* No free space here, try to advance to next map page */
7157 tgl 254 126 : _hash_relbuf(rel, mapbuf);
2232 rhaas 255 126 : mapbuf = InvalidBuffer;
7157 tgl 256 126 : i++;
257 126 : j = 0; /* scan from start of next map page */
258 126 : bit = 0;
259 :
260 : /* Reacquire exclusive lock on the meta page */
2298 rhaas 261 126 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
262 : }
263 :
264 : /*
265 : * No free pages --- have to extend the relation to add an overflow page.
266 : * First, check to see if we have to add a new bitmap page too.
267 : */
7160 tgl 268 126 : if (last_bit == (uint32) (BMPGSZ_BIT(metap) - 1))
269 : {
270 : /*
271 : * We create the new bitmap page with all pages marked "in use".
272 : * Actually two pages in the new bitmap's range will exist
273 : * immediately: the bitmap page itself, and the following page which
274 : * is the one we return to the caller. Both of these are correctly
275 : * marked "in use". Subsequent pages do not exist yet, but it is
276 : * convenient to pre-mark them as "in use" too.
277 : */
7160 tgl 278 UBC 0 : bit = metap->hashm_spares[splitnum];
279 :
280 : /* metapage already has a write lock */
2232 rhaas 281 0 : if (metap->hashm_nmaps >= HASH_MAX_BITMAPS)
282 0 : ereport(ERROR,
283 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
284 : errmsg("out of overflow pages in hash index \"%s\"",
285 : RelationGetRelationName(rel))));
286 :
287 0 : newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM);
288 : }
289 : else
290 : {
291 : /*
292 : * Nothing to do here; since the page will be past the last used page,
293 : * we know its bitmap bit was preinitialized to "in use".
294 : */
295 : }
296 :
297 : /* Calculate address of the new overflow page */
2232 rhaas 298 CBC 126 : bit = BufferIsValid(newmapbuf) ?
299 126 : metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum];
7160 tgl 300 126 : blkno = bitno_to_blkno(metap, bit);
301 :
302 : /*
303 : * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
304 : * relation length stays in sync with ours. XXX It's annoying to do this
305 : * with metapage write lock held; would be better to use a lock that
306 : * doesn't block incoming searches.
307 : *
308 : * It is okay to hold two buffer locks here (one on tail page of bucket
309 : * and other on new overflow page) since there cannot be anyone else
310 : * contending for access to ovflbuf.
311 : */
2232 rhaas 312 126 : ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM);
313 :
314 161 : found:
315 :
316 : /*
317 : * Do the update. No ereport(ERROR) until changes are logged. We want to
318 : * log the changes for the bitmap page and the overflow page together so
319 : * that a page cannot be leaked when a new page is being added.
320 : */
2217 321 161 : START_CRIT_SECTION();
322 :
2232 323 161 : if (page_found)
324 : {
325 35 : Assert(BufferIsValid(mapbuf));
326 :
327 : /* mark page "in use" in the bitmap */
328 35 : SETBIT(freep, bitmap_page_bit);
329 35 : MarkBufferDirty(mapbuf);
330 : }
331 : else
332 : {
333 : /* update the count to indicate new overflow page is added */
334 126 : metap->hashm_spares[splitnum]++;
335 :
336 126 : if (BufferIsValid(newmapbuf))
337 : {
2232 rhaas 338 UBC 0 : _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false);
339 0 : MarkBufferDirty(newmapbuf);
340 :
341 : /* add the new bitmap page to the metapage's list of bitmaps */
342 0 : metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf);
343 0 : metap->hashm_nmaps++;
344 0 : metap->hashm_spares[splitnum]++;
345 : }
346 :
1893 rhaas 347 CBC 126 : MarkBufferDirty(metabuf);
348 :
349 : /*
350 : * For a new overflow page, we don't need to explicitly set the bit in
351 : * the bitmap page, as by default it is already set to "in use".
352 : */
353 : }
354 :
355 : /*
356 : * Adjust hashm_firstfree to avoid redundant searches. But don't risk
357 : * changing it if someone moved it while we were searching bitmap pages.
358 : */
7157 tgl 359 161 : if (metap->hashm_firstfree == orig_firstfree)
360 : {
361 161 : metap->hashm_firstfree = bit + 1;
2298 rhaas 362 161 : MarkBufferDirty(metabuf);
363 : }
364 :
365 : /* initialize new overflow page */
2232 366 161 : ovflpage = BufferGetPage(ovflbuf);
373 michael 367 161 : ovflopaque = HashPageGetOpaque(ovflpage);
2232 rhaas 368 161 : ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
369 161 : ovflopaque->hasho_nextblkno = InvalidBlockNumber;
370 161 : ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
371 161 : ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
372 161 : ovflopaque->hasho_page_id = HASHO_PAGE_ID;
373 :
374 161 : MarkBufferDirty(ovflbuf);
375 :
376 : /* logically chain overflow page to previous page */
377 161 : pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf);
378 :
379 161 : MarkBufferDirty(buf);
380 :
381 : /* XLOG stuff */
2217 382 161 : if (RelationNeedsWAL(rel))
383 : {
384 : XLogRecPtr recptr;
385 : xl_hash_add_ovfl_page xlrec;
386 :
387 147 : xlrec.bmpage_found = page_found;
388 147 : xlrec.bmsize = metap->hashm_bmsize;
389 :
390 147 : XLogBeginInsert();
391 147 : XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage);
392 :
393 147 : XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT);
394 147 : XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket));
395 :
396 147 : XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
397 :
398 147 : if (BufferIsValid(mapbuf))
399 : {
400 35 : XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD);
401 35 : XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32));
402 : }
403 :
404 147 : if (BufferIsValid(newmapbuf))
2217 rhaas 405 UBC 0 : XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT);
406 :
2217 rhaas 407 CBC 147 : XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD);
408 147 : XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32));
409 :
410 147 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE);
411 :
412 147 : PageSetLSN(BufferGetPage(ovflbuf), recptr);
413 147 : PageSetLSN(BufferGetPage(buf), recptr);
414 :
415 147 : if (BufferIsValid(mapbuf))
416 35 : PageSetLSN(BufferGetPage(mapbuf), recptr);
417 :
418 147 : if (BufferIsValid(newmapbuf))
2217 rhaas 419 UBC 0 : PageSetLSN(BufferGetPage(newmapbuf), recptr);
420 :
2217 rhaas 421 CBC 147 : PageSetLSN(BufferGetPage(metabuf), recptr);
422 : }
423 :
424 161 : END_CRIT_SECTION();
425 :
2232 426 161 : if (retain_pin)
427 47 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
428 : else
429 114 : _hash_relbuf(rel, buf);
430 :
431 161 : if (BufferIsValid(mapbuf))
432 35 : _hash_relbuf(rel, mapbuf);
433 :
434 161 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
435 :
436 161 : if (BufferIsValid(newmapbuf))
2232 rhaas 437 UBC 0 : _hash_relbuf(rel, newmapbuf);
438 :
2232 rhaas 439 CBC 161 : return ovflbuf;
440 : }
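
/*
 * Hypothetical caller sketch, not part of the original file: this is roughly
 * how an insertion path might use _hash_addovflpage() after discovering that
 * the current tail page of the bucket is full.  The real logic lives in
 * hashinsert.c and differs in detail; 'buf' is assumed to be pinned and
 * write-locked, and 'metabuf' pinned but unlocked, as described above.
 */
static inline Buffer
addovfl_caller_sketch(Relation rel, Buffer metabuf, Buffer buf,
					  bool retain_pin)
{
	/* release our write lock on the full tail page, keeping the pin */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);

	/* returns a new, empty overflow page, pinned and write-locked */
	return _hash_addovflpage(rel, metabuf, buf, retain_pin);
}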
441 :
442 : /*
443 : * _hash_firstfreebit()
444 : *
445 : * Return the number of the first bit that is not set in the word 'map'.
446 : */
447 : static uint32
9770 scrappy 448 35 : _hash_firstfreebit(uint32 map)
449 : {
450 : uint32 i,
451 : mask;
452 :
9345 bruce 453 35 : mask = 0x1;
454 289 : for (i = 0; i < BITS_PER_MAP; i++)
455 : {
456 289 : if (!(mask & map))
8986 457 35 : return i;
7160 tgl 458 254 : mask <<= 1;
459 : }
460 :
7157 tgl 461 UBC 0 : elog(ERROR, "firstfreebit found no free bit");
462 :
463 : return 0; /* keep compiler quiet */
464 : }
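
/*
 * Worked example, not part of the original file: with the three low-order
 * bits of the map word set (0x00000007), bits 0-2 are "in use", so the first
 * free bit is bit 3 and _hash_firstfreebit(0x00000007) returns 3.
 */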
465 :
466 : /*
467 : * _hash_freeovflpage() -
468 : *
469 : * Remove this overflow page from its bucket's chain, and mark the page as
470 : * free. On entry, ovflbuf is write-locked; it is released before exiting.
471 : *
472 : * Add the tuples (itups) to wbuf in this function. We could do that in the
473 : * caller as well, but the advantage of doing it here is we can easily write
474 : * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and
475 : * removal of the overflow page have to be done as an atomic operation;
476 : * otherwise, during replay on a standby, users might find duplicate records.
477 : *
478 : * Since this function is invoked in VACUUM, we provide an access strategy
479 : * parameter that controls fetches of the bucket pages.
480 : *
481 : * Returns the block number of the page that followed the given page
482 : * in the bucket, or InvalidBlockNumber if no following page.
483 : *
484 : * NB: the caller must not hold a lock on the metapage, nor on the page
485 : * that follows ovflbuf in the bucket chain. We don't acquire a lock on
486 : * the page that precedes ovflbuf in the chain if it is the same as wbuf,
487 : * because the caller already holds a lock on it.
488 : */
489 : BlockNumber
2232 rhaas 490 CBC 65 : _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf,
491 : Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets,
492 : Size *tups_size, uint16 nitups,
493 : BufferAccessStrategy bstrategy)
494 : {
495 : HashMetaPage metap;
496 : Buffer metabuf;
497 : Buffer mapbuf;
498 : BlockNumber ovflblkno;
499 : BlockNumber prevblkno;
500 : BlockNumber blkno;
501 : BlockNumber nextblkno;
502 : BlockNumber writeblkno;
503 : HashPageOpaque ovflopaque;
504 : Page ovflpage;
505 : Page mappage;
506 : uint32 *freep;
507 : uint32 ovflbitno;
508 : int32 bitmappage,
509 : bitmapbit;
510 : Bucket bucket PG_USED_FOR_ASSERTS_ONLY;
2217 511 65 : Buffer prevbuf = InvalidBuffer;
512 65 : Buffer nextbuf = InvalidBuffer;
513 65 : bool update_metap = false;
514 :
515 : /* Get information from the doomed page */
6363 tgl 516 65 : _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE);
6797 bruce 517 65 : ovflblkno = BufferGetBlockNumber(ovflbuf);
2545 kgrittn 518 65 : ovflpage = BufferGetPage(ovflbuf);
373 michael 519 65 : ovflopaque = HashPageGetOpaque(ovflpage);
9345 bruce 520 65 : nextblkno = ovflopaque->hasho_nextblkno;
521 65 : prevblkno = ovflopaque->hasho_prevblkno;
2321 rhaas 522 65 : writeblkno = BufferGetBlockNumber(wbuf);
9345 bruce 523 65 : bucket = ovflopaque->hasho_bucket;
524 :
525 : /*
526 : * Fix up the bucket chain. this is a doubly-linked list, so we must fix
527 : * up the bucket chain members behind and ahead of the overflow page being
528 : * deleted. Concurrency issues are avoided by using lock chaining as
529 : * described atop hashbucketcleanup.
530 : */
531 65 : if (BlockNumberIsValid(prevblkno))
532 : {
2321 rhaas 533 65 : if (prevblkno == writeblkno)
534 26 : prevbuf = wbuf;
535 : else
536 39 : prevbuf = _hash_getbuf_with_strategy(rel,
537 : prevblkno,
538 : HASH_WRITE,
539 : LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
540 : bstrategy);
541 : }
9345 bruce 542 65 : if (BlockNumberIsValid(nextblkno))
2232 rhaas 543 UBC 0 : nextbuf = _hash_getbuf_with_strategy(rel,
544 : nextblkno,
545 : HASH_WRITE,
546 : LH_OVERFLOW_PAGE,
547 : bstrategy);
548 :
549 : /* Note: bstrategy is intentionally not used for metapage and bitmap */
550 :
551 : /* Read the metapage so we can determine which bitmap page to use */
5820 tgl 552 CBC 65 : metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
2545 kgrittn 553 65 : metap = HashPageGetMeta(BufferGetPage(metabuf));
554 :
555 : /* Identify which bit to set */
2257 rhaas 556 65 : ovflbitno = _hash_ovflblkno_to_bitno(metap, ovflblkno);
557 :
7160 tgl 558 65 : bitmappage = ovflbitno >> BMPG_SHIFT(metap);
559 65 : bitmapbit = ovflbitno & BMPG_MASK(metap);
560 :
561 65 : if (bitmappage >= metap->hashm_nmaps)
7160 tgl 562 UBC 0 : elog(ERROR, "invalid overflow bit number %u", ovflbitno);
9345 bruce 563 CBC 65 : blkno = metap->hashm_mapp[bitmappage];
564 :
565 : /* Release metapage lock while we access the bitmap page */
2298 rhaas 566 65 : LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
567 :
568 : /* read the bitmap page to clear the bitmap bit */
5820 tgl 569 65 : mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE);
2545 kgrittn 570 65 : mappage = BufferGetPage(mapbuf);
9345 bruce 571 65 : freep = HashPageGetBitmap(mappage);
7157 tgl 572 65 : Assert(ISSET(freep, bitmapbit));
573 :
574 : /* Get write-lock on metapage to update firstfree */
2298 rhaas 575 65 : LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
576 :
577 : /* This operation needs to log multiple tuples, so prepare WAL for that */
2217 578 65 : if (RelationNeedsWAL(rel))
579 65 : XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, 4 + nitups);
580 :
581 65 : START_CRIT_SECTION();
582 :
583 : /*
584 : * we have to insert tuples on the "write" page, being careful to preserve
585 : * hashkey ordering. (If we insert many tuples into the same "write" page
586 : * it would be worth qsort'ing them).
587 : */
2232 588 65 : if (nitups > 0)
589 : {
590 29 : _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
591 29 : MarkBufferDirty(wbuf);
592 : }
593 :
594 : /*
595 : * Reinitialize the freed overflow page. Just zeroing the page won't
596 : * work, because WAL replay routines expect pages to be initialized. See
597 : * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. We are
598 : * careful to make the special space valid here so that tools like
599 : * pageinspect won't get confused.
600 : */
601 65 : _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
602 :
373 michael 603 65 : ovflopaque = HashPageGetOpaque(ovflpage);
604 :
2195 rhaas 605 65 : ovflopaque->hasho_prevblkno = InvalidBlockNumber;
606 65 : ovflopaque->hasho_nextblkno = InvalidBlockNumber;
646 peter 607 65 : ovflopaque->hasho_bucket = InvalidBucket;
2195 rhaas 608 65 : ovflopaque->hasho_flag = LH_UNUSED_PAGE;
609 65 : ovflopaque->hasho_page_id = HASHO_PAGE_ID;
610 :
2232 611 65 : MarkBufferDirty(ovflbuf);
612 :
613 65 : if (BufferIsValid(prevbuf))
614 : {
615 65 : Page prevpage = BufferGetPage(prevbuf);
373 michael 616 65 : HashPageOpaque prevopaque = HashPageGetOpaque(prevpage);
617 :
2232 rhaas 618 65 : Assert(prevopaque->hasho_bucket == bucket);
619 65 : prevopaque->hasho_nextblkno = nextblkno;
620 65 : MarkBufferDirty(prevbuf);
621 : }
622 65 : if (BufferIsValid(nextbuf))
623 : {
2232 rhaas 624 UBC 0 : Page nextpage = BufferGetPage(nextbuf);
373 michael 625 0 : HashPageOpaque nextopaque = HashPageGetOpaque(nextpage);
626 :
2232 rhaas 627 0 : Assert(nextopaque->hasho_bucket == bucket);
628 0 : nextopaque->hasho_prevblkno = prevblkno;
629 0 : MarkBufferDirty(nextbuf);
630 : }
631 :
632 : /* Clear the bitmap bit to indicate that this overflow page is free */
2232 rhaas 633 CBC 65 : CLRBIT(freep, bitmapbit);
634 65 : MarkBufferDirty(mapbuf);
635 :
636 : /* if this is now the first free page, update hashm_firstfree */
7160 tgl 637 65 : if (ovflbitno < metap->hashm_firstfree)
638 : {
639 62 : metap->hashm_firstfree = ovflbitno;
2217 rhaas 640 62 : update_metap = true;
2305 641 62 : MarkBufferDirty(metabuf);
642 : }
643 :
644 : /* XLOG stuff */
2217 645 65 : if (RelationNeedsWAL(rel))
646 : {
647 : xl_hash_squeeze_page xlrec;
648 : XLogRecPtr recptr;
649 : int i;
650 :
651 65 : xlrec.prevblkno = prevblkno;
652 65 : xlrec.nextblkno = nextblkno;
653 65 : xlrec.ntups = nitups;
654 65 : xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf);
655 65 : xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf);
656 :
657 65 : XLogBeginInsert();
658 65 : XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage);
659 :
660 : /*
661 : * bucket buffer needs to be registered to ensure that we can acquire
662 : * a cleanup lock on it during replay.
663 : */
664 65 : if (!xlrec.is_prim_bucket_same_wrt)
665 3 : XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
666 :
667 65 : XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
668 65 : if (xlrec.ntups > 0)
669 : {
670 29 : XLogRegisterBufData(1, (char *) itup_offsets,
671 : nitups * sizeof(OffsetNumber));
672 1105 : for (i = 0; i < nitups; i++)
673 1076 : XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
674 : }
675 :
676 65 : XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD);
677 :
678 : /*
679 : * If prevpage and the writepage (the block into which we are moving
680 : * tuples from the overflow page) are the same, there is no need to
681 : * register prevpage separately. During replay, we can directly update
682 : * the next block number in the writepage.
683 : */
684 65 : if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
685 39 : XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD);
686 :
687 65 : if (BufferIsValid(nextbuf))
2217 rhaas 688 UBC 0 : XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD);
689 :
2217 rhaas 690 CBC 65 : XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD);
691 65 : XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32));
692 :
693 65 : if (update_metap)
694 : {
695 62 : XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD);
696 62 : XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32));
697 : }
698 :
699 65 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE);
700 :
701 65 : PageSetLSN(BufferGetPage(wbuf), recptr);
702 65 : PageSetLSN(BufferGetPage(ovflbuf), recptr);
703 :
704 65 : if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt)
705 39 : PageSetLSN(BufferGetPage(prevbuf), recptr);
706 65 : if (BufferIsValid(nextbuf))
2217 rhaas 707 UBC 0 : PageSetLSN(BufferGetPage(nextbuf), recptr);
708 :
2217 rhaas 709 CBC 65 : PageSetLSN(BufferGetPage(mapbuf), recptr);
710 :
711 65 : if (update_metap)
712 62 : PageSetLSN(BufferGetPage(metabuf), recptr);
713 : }
714 :
715 65 : END_CRIT_SECTION();
716 :
717 : /* release previous bucket if it is not the same as the write bucket */
2232 718 65 : if (BufferIsValid(prevbuf) && prevblkno != writeblkno)
719 39 : _hash_relbuf(rel, prevbuf);
720 :
721 65 : if (BufferIsValid(ovflbuf))
722 65 : _hash_relbuf(rel, ovflbuf);
723 :
724 65 : if (BufferIsValid(nextbuf))
2232 rhaas 725 UBC 0 : _hash_relbuf(rel, nextbuf);
726 :
2232 rhaas 727 CBC 65 : _hash_relbuf(rel, mapbuf);
2305 728 65 : _hash_relbuf(rel, metabuf);
729 :
7160 tgl 730 65 : return nextblkno;
731 : }
732 :
733 :
734 : /*
735 : * _hash_initbitmapbuffer()
736 : *
737 : * Initialize a new bitmap page. All bits in the new bitmap page are set to
738 : * "1", indicating "in use".
739 : */
740 : void
2232 rhaas 741 170 : _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage)
742 : {
743 : Page pg;
744 : HashPageOpaque op;
745 : uint32 *freep;
746 :
747 170 : pg = BufferGetPage(buf);
748 :
749 : /* initialize the page */
750 170 : if (initpage)
751 21 : _hash_pageinit(pg, BufferGetPageSize(buf));
752 :
753 : /* initialize the page's special space */
373 michael 754 170 : op = HashPageGetOpaque(pg);
2232 rhaas 755 170 : op->hasho_prevblkno = InvalidBlockNumber;
756 170 : op->hasho_nextblkno = InvalidBlockNumber;
646 peter 757 170 : op->hasho_bucket = InvalidBucket;
2232 rhaas 758 170 : op->hasho_flag = LH_BITMAP_PAGE;
759 170 : op->hasho_page_id = HASHO_PAGE_ID;
760 :
761 : /* set all of the bits to 1 */
762 170 : freep = HashPageGetBitmap(pg);
282 peter 763 GNC 170 : memset(freep, 0xFF, bmsize);
764 :
765 : /*
766 : * Set pd_lower just past the end of the bitmap page data. We could even
767 : * set pd_lower equal to pd_upper, but this is more precise and makes the
768 : * page look compressible to xlog.c.
769 : */
2232 rhaas 770 CBC 170 : ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg;
771 170 : }
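
/*
 * Illustrative sketch, not part of the original file: each bitmap page
 * initialized above tracks one overflow page per bit, so a single bitmap
 * page covers BMPGSZ_BIT(metap) overflow pages, and an index with
 * hashm_nmaps bitmap pages can track the product of the two.
 */
static inline uint32
ovfl_bitmap_capacity_sketch(HashMetaPage metap)
{
	return metap->hashm_nmaps * BMPGSZ_BIT(metap);
}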
772 :
773 :
774 : /*
775 : * _hash_squeezebucket(rel, bucket)
776 : *
777 : * Try to squeeze the tuples onto pages occurring earlier in the
778 : * bucket chain in an attempt to free overflow pages. When we start
779 : * the "squeezing", the page from which we start taking tuples (the
780 : * "read" page) is the last bucket in the bucket chain and the page
781 : * onto which we start squeezing tuples (the "write" page) is the
782 : * first page in the bucket chain. The read page works backward and
783 : * the write page works forward; the procedure terminates when the
784 : * read page and write page are the same page.
785 : *
786 : * At completion of this procedure, it is guaranteed that all pages in
787 : * the bucket are nonempty, unless the bucket is totally empty (in
788 : * which case all overflow pages will be freed). The original implementation
789 : * required that to be true on entry as well, but it's a lot easier for
790 : * callers to leave empty overflow pages and let this function clean them up.
791 : *
792 : * Caller must acquire cleanup lock on the primary page of the target
793 : * bucket to exclude any scans that are in progress, which could easily
794 : * be confused into returning the same tuple more than once or some tuples
795 : * not at all by the rearrangement we are performing here. To prevent
796 : * any concurrent scan from crossing the squeeze scan we use lock chaining
797 : * similar to hashbucketcleanup. Refer to the comments atop hashbucketcleanup.
798 : *
799 : * We need to retain a pin on the primary bucket to ensure that no concurrent
800 : * split can start.
801 : *
802 : * Since this function is invoked in VACUUM, we provide an access strategy
803 : * parameter that controls fetches of the bucket pages.
804 : */
805 : void
9770 scrappy 806 684 : _hash_squeezebucket(Relation rel,
807 : Bucket bucket,
808 : BlockNumber bucket_blkno,
809 : Buffer bucket_buf,
810 : BufferAccessStrategy bstrategy)
811 : {
812 : BlockNumber wblkno;
813 : BlockNumber rblkno;
814 : Buffer wbuf;
815 : Buffer rbuf;
816 : Page wpage;
817 : Page rpage;
818 : HashPageOpaque wopaque;
819 : HashPageOpaque ropaque;
820 :
821 : /*
822 : * start squeezing into the primary bucket page.
823 : */
7159 tgl 824 684 : wblkno = bucket_blkno;
2321 rhaas 825 684 : wbuf = bucket_buf;
2545 kgrittn 826 684 : wpage = BufferGetPage(wbuf);
373 michael 827 684 : wopaque = HashPageGetOpaque(wpage);
828 :
829 : /*
830 : * If there aren't any overflow pages, there's nothing to squeeze. The
831 : * caller is responsible for releasing the pin on the primary bucket page.
832 : */
9345 bruce 833 684 : if (!BlockNumberIsValid(wopaque->hasho_nextblkno))
834 : {
2298 rhaas 835 655 : LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
9770 scrappy 836 655 : return;
837 : }
838 :
839 : /*
840 : * Find the last page in the bucket chain by starting at the base bucket
841 : * page and working forward. Note: we assume that a hash bucket chain is
842 : * usually smaller than the buffer ring being used by VACUUM, else using
843 : * the access strategy here would be counterproductive.
844 : */
4907 tgl 845 29 : rbuf = InvalidBuffer;
9345 bruce 846 29 : ropaque = wopaque;
847 : do
848 : {
849 143 : rblkno = ropaque->hasho_nextblkno;
4907 tgl 850 143 : if (rbuf != InvalidBuffer)
7157 851 114 : _hash_relbuf(rel, rbuf);
5793 852 143 : rbuf = _hash_getbuf_with_strategy(rel,
853 : rblkno,
854 : HASH_WRITE,
855 : LH_OVERFLOW_PAGE,
856 : bstrategy);
2545 kgrittn 857 143 : rpage = BufferGetPage(rbuf);
373 michael 858 143 : ropaque = HashPageGetOpaque(rpage);
9345 bruce 859 143 : Assert(ropaque->hasho_bucket == bucket);
860 143 : } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
861 :
862 : /*
863 : * squeeze the tuples.
864 : */
865 : for (;;)
866 39 : {
867 : OffsetNumber roffnum;
868 : OffsetNumber maxroffnum;
869 : OffsetNumber deletable[MaxOffsetNumber];
870 : IndexTuple itups[MaxIndexTuplesPerPage];
871 : Size tups_size[MaxIndexTuplesPerPage];
872 : OffsetNumber itup_offsets[MaxIndexTuplesPerPage];
2232 rhaas 873 68 : uint16 ndeletable = 0;
874 68 : uint16 nitups = 0;
875 68 : Size all_tups_size = 0;
876 : int i;
2321 877 68 : bool retain_pin = false;
878 :
2232 879 68 : readpage:
880 : /* Scan each tuple in "read" page */
4907 tgl 881 68 : maxroffnum = PageGetMaxOffsetNumber(rpage);
882 68 : for (roffnum = FirstOffsetNumber;
883 1144 : roffnum <= maxroffnum;
884 1076 : roffnum = OffsetNumberNext(roffnum))
885 : {
886 : IndexTuple itup;
887 : Size itemsz;
888 :
889 : /* skip dead tuples */
2343 rhaas 890 1079 : if (ItemIdIsDead(PageGetItemId(rpage, roffnum)))
2343 rhaas 891 UBC 0 : continue;
892 :
6283 tgl 893 CBC 1079 : itup = (IndexTuple) PageGetItem(rpage,
894 : PageGetItemId(rpage, roffnum));
1866 895 1079 : itemsz = IndexTupleSize(itup);
7157 896 1079 : itemsz = MAXALIGN(itemsz);
897 :
898 : /*
899 : * Walk up the bucket chain, looking for a page big enough for
900 : * this item and all other accumulated items. Exit if we reach
901 : * the read page.
902 : */
2232 rhaas 903 1154 : while (PageGetFreeSpaceForMultipleTuples(wpage, nitups + 1) < (all_tups_size + itemsz))
904 : {
2321 905 78 : Buffer next_wbuf = InvalidBuffer;
2232 906 78 : bool tups_moved = false;
907 :
7157 tgl 908 78 : Assert(!PageIsEmpty(wpage));
909 :
2321 rhaas 910 78 : if (wblkno == bucket_blkno)
911 6 : retain_pin = true;
912 :
7157 tgl 913 78 : wblkno = wopaque->hasho_nextblkno;
914 78 : Assert(BlockNumberIsValid(wblkno));
915 :
916 : /* don't need to move to next page if we reached the read page */
2321 rhaas 917 78 : if (wblkno != rblkno)
918 75 : next_wbuf = _hash_getbuf_with_strategy(rel,
919 : wblkno,
920 : HASH_WRITE,
921 : LH_OVERFLOW_PAGE,
922 : bstrategy);
923 :
2232 924 78 : if (nitups > 0)
925 : {
2232 rhaas 926 UBC 0 : Assert(nitups == ndeletable);
927 :
928 : /*
929 : * This operation needs to log multiple tuples, so prepare
930 : * WAL for that.
931 : */
2217 932 0 : if (RelationNeedsWAL(rel))
933 0 : XLogEnsureRecordSpace(0, 3 + nitups);
934 :
935 0 : START_CRIT_SECTION();
936 :
937 : /*
938 : * we have to insert tuples on the "write" page, being
939 : * careful to preserve hashkey ordering. (If we insert
940 : * many tuples into the same "write" page it would be
941 : * worth qsort'ing them).
942 : */
2232 943 0 : _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups);
944 0 : MarkBufferDirty(wbuf);
945 :
946 : /* Delete tuples we already moved off read page */
947 0 : PageIndexMultiDelete(rpage, deletable, ndeletable);
948 0 : MarkBufferDirty(rbuf);
949 :
950 : /* XLOG stuff */
2217 951 0 : if (RelationNeedsWAL(rel))
952 : {
953 : XLogRecPtr recptr;
954 : xl_hash_move_page_contents xlrec;
955 :
956 0 : xlrec.ntups = nitups;
578 michael 957 0 : xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf);
958 :
2217 rhaas 959 0 : XLogBeginInsert();
960 0 : XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents);
961 :
962 : /*
963 : * bucket buffer needs to be registered to ensure that
964 : * we can acquire a cleanup lock on it during replay.
965 : */
966 0 : if (!xlrec.is_prim_bucket_same_wrt)
967 0 : XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
968 :
969 0 : XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD);
970 0 : XLogRegisterBufData(1, (char *) itup_offsets,
971 : nitups * sizeof(OffsetNumber));
972 0 : for (i = 0; i < nitups; i++)
973 0 : XLogRegisterBufData(1, (char *) itups[i], tups_size[i]);
974 :
975 0 : XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
976 0 : XLogRegisterBufData(2, (char *) deletable,
977 : ndeletable * sizeof(OffsetNumber));
978 :
979 0 : recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS);
980 :
981 0 : PageSetLSN(BufferGetPage(wbuf), recptr);
982 0 : PageSetLSN(BufferGetPage(rbuf), recptr);
983 : }
984 :
985 0 : END_CRIT_SECTION();
986 :
2232 987 0 : tups_moved = true;
988 : }
989 :
990 : /*
991 : * release the lock on previous page after acquiring the lock
992 : * on next page
993 : */
2305 rhaas 994 CBC 78 : if (retain_pin)
2298 995 6 : LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
996 : else
4907 tgl 997 72 : _hash_relbuf(rel, wbuf);
998 :
999 : /* nothing more to do if we reached the read page */
7157 1000 78 : if (rblkno == wblkno)
1001 : {
2305 rhaas 1002 3 : _hash_relbuf(rel, rbuf);
7157 tgl 1003 29 : return;
1004 : }
1005 :
2321 rhaas 1006 75 : wbuf = next_wbuf;
2545 kgrittn 1007 75 : wpage = BufferGetPage(wbuf);
373 michael 1008 75 : wopaque = HashPageGetOpaque(wpage);
7157 tgl 1009 75 : Assert(wopaque->hasho_bucket == bucket);
2321 rhaas 1010 75 : retain_pin = false;
1011 :
1012 : /* be tidy */
2232 1013 75 : for (i = 0; i < nitups; i++)
2232 rhaas 1014 UBC 0 : pfree(itups[i]);
2232 rhaas 1015 CBC 75 : nitups = 0;
1016 75 : all_tups_size = 0;
1017 75 : ndeletable = 0;
1018 :
1019 : /*
1020 : * after moving the tuples, rpage would have been compacted,
1021 : * so we need to rescan it.
1022 : */
1023 75 : if (tups_moved)
2232 rhaas 1024 UBC 0 : goto readpage;
1025 : }
1026 :
1027 : /* remember tuple for deletion from "read" page */
4907 tgl 1028 CBC 1076 : deletable[ndeletable++] = roffnum;
1029 :
1030 : /*
1031 : * we need a copy of index tuples as they can be freed as part of
1032 : * overflow page, however we need them to write a WAL record in
1033 : * _hash_freeovflpage.
1034 : */
2232 rhaas 1035 1076 : itups[nitups] = CopyIndexTuple(itup);
1036 1076 : tups_size[nitups++] = itemsz;
1037 1076 : all_tups_size += itemsz;
1038 : }
1039 :
1040 : /*
1041 : * If we reach here, there are no live tuples on the "read" page ---
1042 : * it was empty when we got to it, or we moved them all. So we can
1043 : * just free the page without bothering with deleting tuples
1044 : * individually. Then advance to the previous "read" page.
1045 : *
1046 : * Tricky point here: if our read and write pages are adjacent in the
1047 : * bucket chain, our write lock on wbuf will conflict with
1048 : * _hash_freeovflpage's attempt to update the sibling links of the
1049 : * removed page. In that case, we don't need to lock it again.
1050 : */
4907 tgl 1051 65 : rblkno = ropaque->hasho_prevblkno;
1052 65 : Assert(BlockNumberIsValid(rblkno));
1053 :
1054 : /* free this overflow page (releases rbuf) */
2232 rhaas 1055 65 : _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets,
1056 : tups_size, nitups, bstrategy);
1057 :
1058 : /* be tidy */
1059 1141 : for (i = 0; i < nitups; i++)
1060 1076 : pfree(itups[i]);
1061 :
1062 : /* are we freeing the page adjacent to wbuf? */
4907 tgl 1063 65 : if (rblkno == wblkno)
1064 : {
1065 : /* retain the pin on primary bucket page till end of bucket scan */
2302 rhaas 1066 26 : if (wblkno == bucket_blkno)
2298 1067 23 : LockBuffer(wbuf, BUFFER_LOCK_UNLOCK);
1068 : else
2302 1069 3 : _hash_relbuf(rel, wbuf);
4907 tgl 1070 26 : return;
1071 : }
1072 :
1073 39 : rbuf = _hash_getbuf_with_strategy(rel,
1074 : rblkno,
1075 : HASH_WRITE,
1076 : LH_OVERFLOW_PAGE,
1077 : bstrategy);
2545 kgrittn 1078 39 : rpage = BufferGetPage(rbuf);
373 michael 1079 39 : ropaque = HashPageGetOpaque(rpage);
4907 tgl 1080 39 : Assert(ropaque->hasho_bucket == bucket);
1081 : }
1082 :
1083 : /* NOTREACHED */
1084 : }
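
/*
 * Hypothetical caller sketch, not part of the original file: vacuum-style
 * cleanup would invoke the squeeze only while holding a cleanup lock on the
 * primary bucket page, roughly as below.  The real caller is
 * hashbucketcleanup() in hash.c, which differs in detail.
 */
static inline void
squeeze_caller_sketch(Relation rel, Bucket bucket, BlockNumber bucket_blkno,
					  Buffer bucket_buf, BufferAccessStrategy bstrategy)
{
	/* squeezing is only safe if no one else holds a pin on the primary page */
	if (IsBufferCleanupOK(bucket_buf))
		_hash_squeezebucket(rel, bucket, bucket_blkno, bucket_buf, bstrategy);
	else
		LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}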