Age Owner TLA Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * freespace.c
4 : * POSTGRES free space map for quickly finding free space in relations
5 : *
6 : *
7 : * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : * IDENTIFICATION
11 : * src/backend/storage/freespace/freespace.c
12 : *
13 : *
14 : * NOTES:
15 : *
16 : * Free Space Map keeps track of the amount of free space on pages, and
17 : * allows quickly searching for a page with enough free space. The FSM is
18 : * stored in a dedicated relation fork of all heap relations, and those
19 : * index access methods that need it (see also indexfsm.c). See README for
20 : * more information.
21 : *
22 : *-------------------------------------------------------------------------
23 : */
24 : #include "postgres.h"
25 :
26 : #include "access/htup_details.h"
27 : #include "access/xloginsert.h"
28 : #include "access/xlogutils.h"
29 : #include "miscadmin.h"
30 : #include "storage/freespace.h"
31 : #include "storage/fsm_internals.h"
32 : #include "storage/lmgr.h"
33 : #include "storage/smgr.h"
34 :
35 :
36 : /*
37 : * We use just one byte to store the amount of free space on a page, so we
38 : * divide the amount of free space a page can have into 256 different
39 : * categories. The highest category, 255, represents a page with at least
40 : * MaxFSMRequestSize bytes of free space, and the second highest category
41 : * represents the range from 254 * FSM_CAT_STEP, inclusive, to
42 : * MaxFSMRequestSize, exclusive.
43 : *
44 : * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
45 : * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
46 : * categories look like this:
47 : *
48 : *
49 : * Range Category
50 : * 0 - 31 0
51 : * 32 - 63 1
52 : * ... ... ...
53 : * 8096 - 8127 253
54 : * 8128 - 8163 254
55 : * 8164 - 8192 255
56 : *
57 : * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
58 : * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
59 : * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
60 : * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
61 : * completely empty page, that would mean that we could never satisfy a
62 : * request of exactly MaxFSMRequestSize bytes.
63 : */
64 : #define FSM_CATEGORIES 256
65 : #define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
66 : #define MaxFSMRequestSize MaxHeapTupleSize
67 :
68 : /*
69 : * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
70 : * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
71 : * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
72 : * this means that 4096 bytes is the smallest BLCKSZ that we can get away
73 : * with a 3-level tree, and 512 is the smallest we support.
74 : */
75 : #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
76 :
77 : #define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
78 : #define FSM_BOTTOM_LEVEL 0
79 :
80 : /*
81 : * The internal FSM routines work on a logical addressing scheme. Each
82 : * level of the tree can be thought of as a separately addressable file.
83 : */
84 : typedef struct
85 : {
86 : int level; /* level */
87 : int logpageno; /* page number within the level */
88 : } FSMAddress;
89 :
90 : /* Address of the root page. */
91 : static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
92 :
93 : /* functions to navigate the tree */
94 : static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
95 : static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
96 : static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
97 : static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
98 : static BlockNumber fsm_logical_to_physical(FSMAddress addr);
99 :
100 : static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
101 : static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
102 :
103 : /* functions to convert amount of free space to a FSM category */
104 : static uint8 fsm_space_avail_to_cat(Size avail);
105 : static uint8 fsm_space_needed_to_cat(Size needed);
106 : static Size fsm_space_cat_to_avail(uint8 cat);
107 :
108 : /* workhorse functions for various operations */
109 : static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
110 : uint8 newValue, uint8 minValue);
111 : static BlockNumber fsm_search(Relation rel, uint8 min_cat);
112 : static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
113 : BlockNumber start, BlockNumber end,
114 : bool *eof_p);
115 :
116 :
117 : /******** Public API ********/
118 :
119 : /*
120 : * GetPageWithFreeSpace - try to find a page in the given relation with
121 : * at least the specified amount of free space.
122 : *
123 : * If successful, return the block number; if not, return InvalidBlockNumber.
124 : *
125 : * The caller must be prepared for the possibility that the returned page
126 : * will turn out to have too little space available by the time the caller
127 : * gets a lock on it. In that case, the caller should report the actual
128 : * amount of free space available on that page and then try again (see
129 : * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
130 : * extend the relation.
131 : */
132 : BlockNumber
1433 akapila 133 CBC 177585 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
134 : {
5050 bruce 135 177585 : uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
136 :
1433 akapila 137 177585 : return fsm_search(rel, min_cat);
138 : }
139 :
140 : /*
141 : * RecordAndGetPageWithFreeSpace - update info about a page and try again.
142 : *
143 : * We provide this combo form to save some locking overhead, compared to
144 : * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
145 : * also some effort to return a page close to the old page; if there's a
146 : * page with enough free space on the same FSM page where the old one page
147 : * is located, it is preferred.
148 : */
149 : BlockNumber
5304 heikki.linnakangas 150 209060 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
151 : Size oldSpaceAvail, Size spaceNeeded)
152 : {
1433 akapila 153 209060 : int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
154 209060 : int search_cat = fsm_space_needed_to_cat(spaceNeeded);
155 : FSMAddress addr;
156 : uint16 slot;
157 : int search_slot;
158 :
159 : /* Get the location of the FSM byte representing the heap block */
5304 heikki.linnakangas 160 209060 : addr = fsm_get_location(oldPage, &slot);
161 :
162 209060 : search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
163 :
164 : /*
165 : * If fsm_set_and_search found a suitable new block, return that.
166 : * Otherwise, search as usual.
167 : */
168 209060 : if (search_slot != -1)
169 26430 : return fsm_get_heap_blk(addr, search_slot);
170 : else
171 182630 : return fsm_search(rel, search_cat);
172 : }
173 :
174 : /*
175 : * RecordPageWithFreeSpace - update info about a page.
176 : *
177 : * Note that if the new spaceAvail value is higher than the old value stored
178 : * in the FSM, the space might not become visible to searchers until the next
179 : * FreeSpaceMapVacuum call, which updates the upper level pages.
180 : */
181 : void
1433 akapila 182 177963 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
183 : {
184 177963 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
185 : FSMAddress addr;
186 : uint16 slot;
187 :
188 : /* Get the location of the FSM byte representing the heap block */
5304 heikki.linnakangas 189 177963 : addr = fsm_get_location(heapBlk, &slot);
190 :
191 177963 : fsm_set_and_search(rel, addr, slot, new_cat, 0);
7341 tgl 192 177963 : }
193 :
194 : /*
195 : * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
196 : * WAL replay
197 : */
198 : void
277 rhaas 199 GNC 271448 : XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
200 : Size spaceAvail)
201 : {
5273 heikki.linnakangas 202 CBC 271448 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
203 : FSMAddress addr;
204 : uint16 slot;
205 : BlockNumber blkno;
206 : Buffer buf;
207 : Page page;
208 :
209 : /* Get the location of the FSM byte representing the heap block */
210 271448 : addr = fsm_get_location(heapBlk, &slot);
211 271448 : blkno = fsm_logical_to_physical(addr);
212 :
213 : /* If the page doesn't exist already, extend */
277 rhaas 214 GNC 271448 : buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
215 : RBM_ZERO_ON_ERROR, InvalidBuffer);
5192 heikki.linnakangas 216 CBC 271448 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
217 :
2545 kgrittn 218 271448 : page = BufferGetPage(buf);
5273 heikki.linnakangas 219 271448 : if (PageIsNew(page))
220 408 : PageInit(page, BLCKSZ, 0);
221 :
222 271448 : if (fsm_set_avail(page, slot, new_cat))
3583 jdavis 223 267733 : MarkBufferDirtyHint(buf, false);
5273 heikki.linnakangas 224 271448 : UnlockReleaseBuffer(buf);
225 271448 : }
226 :
227 : /*
228 : * GetRecordedFreeSpace - return the amount of free space on a particular page,
229 : * according to the FSM.
230 : */
231 : Size
5304 232 45 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
233 : {
234 : FSMAddress addr;
235 : uint16 slot;
236 : Buffer buf;
237 : uint8 cat;
238 :
239 : /* Get the location of the FSM byte representing the heap block */
240 45 : addr = fsm_get_location(heapBlk, &slot);
241 :
242 45 : buf = fsm_readbuf(rel, addr, false);
243 45 : if (!BufferIsValid(buf))
244 35 : return 0;
2545 kgrittn 245 10 : cat = fsm_get_avail(BufferGetPage(buf), slot);
5304 heikki.linnakangas 246 10 : ReleaseBuffer(buf);
247 :
248 10 : return fsm_space_cat_to_avail(cat);
249 : }
250 :
251 : /*
252 : * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
253 : *
254 : * nblocks is the new size of the heap.
255 : *
256 : * Return the number of blocks of new FSM.
257 : * If it's InvalidBlockNumber, there is nothing to truncate;
258 : * otherwise the caller is responsible for calling smgrtruncate()
259 : * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
260 : * to update upper-level pages in the FSM.
261 : */
262 : BlockNumber
1293 fujii 263 116 : FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
264 : {
265 : BlockNumber new_nfsmblocks;
266 : FSMAddress first_removed_address;
267 : uint16 first_removed_slot;
268 : Buffer buf;
269 :
270 : /*
271 : * If no FSM has been created yet for this relation, there's nothing to
272 : * truncate.
273 : */
636 tgl 274 116 : if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
1293 fujii 275 UBC 0 : return InvalidBlockNumber;
276 :
277 : /* Get the location in the FSM of the first removed heap block */
5304 heikki.linnakangas 278 CBC 116 : first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
279 :
280 : /*
281 : * Zero out the tail of the last remaining FSM page. If the slot
282 : * representing the first removed heap block is at a page boundary, as the
283 : * first slot on the FSM page that first_removed_address points to, we can
284 : * just truncate that page altogether.
285 : */
286 116 : if (first_removed_slot > 0)
287 : {
288 58 : buf = fsm_readbuf(rel, first_removed_address, false);
289 58 : if (!BufferIsValid(buf))
1060 tgl 290 UBC 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
291 : * smaller */
5304 heikki.linnakangas 292 CBC 58 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
293 :
294 : /* NO EREPORT(ERROR) from here till changes are logged */
2363 295 58 : START_CRIT_SECTION();
296 :
2545 kgrittn 297 58 : fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
298 :
299 : /*
300 : * Truncation of a relation is WAL-logged at a higher-level, and we
301 : * will be called at WAL replay. But if checksums are enabled, we need
302 : * to still write a WAL record to protect against a torn page, if the
303 : * page is flushed to disk before the truncation WAL record. We cannot
304 : * use MarkBufferDirtyHint here, because that will not dirty the page
305 : * during recovery.
306 : */
2363 heikki.linnakangas 307 58 : MarkBufferDirty(buf);
308 58 : if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
309 15 : log_newpage_buffer(buf, false);
310 :
311 58 : END_CRIT_SECTION();
312 :
5304 313 58 : UnlockReleaseBuffer(buf);
314 :
315 58 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
316 : }
317 : else
318 : {
319 58 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
636 tgl 320 58 : if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
1060 tgl 321 UBC 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
322 : * smaller */
323 : }
324 :
1293 fujii 325 CBC 116 : return new_nfsmblocks;
326 : }
327 :
328 : /*
329 : * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
330 : *
331 : * We assume that the bottom-level pages have already been updated with
332 : * new free-space information.
333 : */
334 : void
5304 heikki.linnakangas 335 132 : FreeSpaceMapVacuum(Relation rel)
336 : {
337 : bool dummy;
338 :
339 : /* Recursively scan the tree, starting at the root */
1837 tgl 340 132 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
341 : (BlockNumber) 0, InvalidBlockNumber,
342 : &dummy);
343 132 : }
344 :
345 : /*
346 : * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
347 : *
348 : * As above, but assume that only heap pages between start and end-1 inclusive
349 : * have new free-space information, so update only the upper-level slots
350 : * covering that block range. end == InvalidBlockNumber is equivalent to
351 : * "all the rest of the relation".
352 : */
353 : void
354 15454 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
355 : {
356 : bool dummy;
357 :
358 : /* Recursively scan the tree, starting at the root */
359 15454 : if (end > start)
360 15343 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
7951 361 15454 : }
362 :
363 : /******** Internal routines ********/
364 :
365 : /*
366 : * Return category corresponding x bytes of free space
367 : */
368 : static uint8
5304 heikki.linnakangas 369 658471 : fsm_space_avail_to_cat(Size avail)
370 : {
371 : int cat;
372 :
373 658471 : Assert(avail < BLCKSZ);
374 :
375 658471 : if (avail >= MaxFSMRequestSize)
376 13013 : return 255;
377 :
378 645458 : cat = avail / FSM_CAT_STEP;
379 :
380 : /*
381 : * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
382 : * more.
383 : */
384 645458 : if (cat > 254)
5304 heikki.linnakangas 385 UBC 0 : cat = 254;
386 :
5304 heikki.linnakangas 387 CBC 645458 : return (uint8) cat;
388 : }
389 :
390 : /*
391 : * Return the lower bound of the range of free space represented by given
392 : * category.
393 : */
394 : static Size
395 10 : fsm_space_cat_to_avail(uint8 cat)
396 : {
397 : /* The highest category represents exactly MaxFSMRequestSize bytes. */
398 10 : if (cat == 255)
5304 heikki.linnakangas 399 UBC 0 : return MaxFSMRequestSize;
400 : else
5304 heikki.linnakangas 401 CBC 10 : return cat * FSM_CAT_STEP;
402 : }
403 :
404 : /*
405 : * Which category does a page need to have, to accommodate x bytes of data?
406 : * While fsm_space_avail_to_cat() rounds down, this needs to round up.
407 : */
408 : static uint8
409 386645 : fsm_space_needed_to_cat(Size needed)
410 : {
411 : int cat;
412 :
413 : /* Can't ask for more space than the highest category represents */
414 386645 : if (needed > MaxFSMRequestSize)
3363 tgl 415 UBC 0 : elog(ERROR, "invalid FSM request size %zu", needed);
416 :
5304 heikki.linnakangas 417 CBC 386645 : if (needed == 0)
5304 heikki.linnakangas 418 UBC 0 : return 1;
419 :
5304 heikki.linnakangas 420 CBC 386645 : cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
421 :
422 386645 : if (cat > 255)
5304 heikki.linnakangas 423 UBC 0 : cat = 255;
424 :
5304 heikki.linnakangas 425 CBC 386645 : return (uint8) cat;
426 : }
427 :
428 : /*
429 : * Returns the physical block number of a FSM page
430 : */
431 : static BlockNumber
432 1091767 : fsm_logical_to_physical(FSMAddress addr)
433 : {
434 : BlockNumber pages;
435 : int leafno;
436 : int l;
437 :
438 : /*
439 : * Calculate the logical page number of the first leaf page below the
440 : * given page.
441 : */
442 1091767 : leafno = addr.logpageno;
443 1876099 : for (l = 0; l < addr.level; l++)
444 784332 : leafno *= SlotsPerFSMPage;
445 :
446 : /* Count upper level nodes required to address the leaf page */
447 1091767 : pages = 0;
448 4367068 : for (l = 0; l < FSM_TREE_DEPTH; l++)
449 : {
450 3275301 : pages += leafno + 1;
451 3275301 : leafno /= SlotsPerFSMPage;
452 : }
453 :
454 : /*
455 : * If the page we were asked for wasn't at the bottom level, subtract the
456 : * additional lower level pages we counted above.
457 : */
458 1091767 : pages -= addr.level;
459 :
460 : /* Turn the page count into 0-based block number */
461 1091767 : return pages - 1;
462 : }
463 :
464 : /*
465 : * Return the FSM location corresponding to given heap block.
466 : */
467 : static FSMAddress
468 720456 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
469 : {
470 : FSMAddress addr;
471 :
472 720456 : addr.level = FSM_BOTTOM_LEVEL;
473 720456 : addr.logpageno = heapblk / SlotsPerFSMPage;
474 720456 : *slot = heapblk % SlotsPerFSMPage;
475 :
476 720456 : return addr;
477 : }
478 :
479 : /*
480 : * Return the heap block number corresponding to given location in the FSM.
481 : */
482 : static BlockNumber
483 36776 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
484 : {
485 36776 : Assert(addr.level == FSM_BOTTOM_LEVEL);
486 36776 : return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
487 : }
488 :
489 : /*
490 : * Given a logical address of a child page, get the logical page number of
491 : * the parent, and the slot within the parent corresponding to the child.
492 : */
493 : static FSMAddress
494 94274 : fsm_get_parent(FSMAddress child, uint16 *slot)
495 : {
496 : FSMAddress parent;
497 :
498 94274 : Assert(child.level < FSM_ROOT_LEVEL);
499 :
500 94274 : parent.level = child.level + 1;
501 94274 : parent.logpageno = child.logpageno / SlotsPerFSMPage;
502 94274 : *slot = child.logpageno % SlotsPerFSMPage;
503 :
504 94274 : return parent;
505 : }
506 :
507 : /*
508 : * Given a logical address of a parent page and a slot number, get the
509 : * logical address of the corresponding child page.
510 : */
511 : static FSMAddress
512 54311 : fsm_get_child(FSMAddress parent, uint16 slot)
513 : {
514 : FSMAddress child;
515 :
516 54311 : Assert(parent.level > FSM_BOTTOM_LEVEL);
517 :
518 54311 : child.level = parent.level - 1;
519 54311 : child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
520 :
521 54311 : return child;
522 : }
523 :
524 : /*
525 : * Read a FSM page.
526 : *
527 : * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
528 : * true, the FSM file is extended.
529 : */
530 : static Buffer
531 820203 : fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
532 : {
533 820203 : BlockNumber blkno = fsm_logical_to_physical(addr);
534 : Buffer buf;
535 : SMgrRelation reln;
536 :
537 : /*
538 : * Caution: re-using this smgr pointer could fail if the relcache entry
539 : * gets closed. It's safe as long as we only do smgr-level operations
540 : * between here and the last use of the pointer.
541 : */
636 tgl 542 820203 : reln = RelationGetSmgr(rel);
543 :
544 : /*
545 : * If we haven't cached the size of the FSM yet, check it first. Also
546 : * recheck if the requested block seems to be past end, since our cached
547 : * value might be stale. (We send smgr inval messages on truncation, but
548 : * not on extension.)
549 : */
550 820203 : if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
551 669137 : blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
552 : {
553 : /* Invalidate the cache so smgrnblocks asks the kernel. */
554 199913 : reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
555 199913 : if (smgrexists(reln, FSM_FORKNUM))
556 101414 : smgrnblocks(reln, FSM_FORKNUM);
557 : else
558 98499 : reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
559 : }
560 :
561 : /*
562 : * For reading we use ZERO_ON_ERROR mode, and initialize the page if
563 : * necessary. The FSM information is not accurate anyway, so it's better
564 : * to clear corrupt pages than error out. Since the FSM changes are not
565 : * WAL-logged, the so-called torn page problem on crash can lead to pages
566 : * with corrupt headers, for example.
567 : *
568 : * We use the same path below to initialize pages when extending the
569 : * relation, as a concurrent extension can end up with vm_extend()
570 : * returning an already-initialized page.
571 : */
636 tgl 572 GIC 820203 : if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
573 : {
5304 heikki.linnakangas 574 98967 : if (extend)
4 andres 575 GNC 14421 : buf = fsm_extend(rel, blkno + 1);
576 : else
5304 heikki.linnakangas 577 GIC 84546 : return InvalidBuffer;
578 : }
579 : else
4 andres 580 GNC 721236 : buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
581 :
582 : /*
583 : * Initializing the page when needed is trickier than it looks, because of
584 : * the possibility of multiple backends doing this concurrently, and our
585 : * desire to not uselessly take the buffer lock in the normal path where
1731 tgl 586 ECB : * the page is OK. We must take the lock to initialize the page, so
587 : * recheck page newness after we have the lock, in case someone else
588 : * already did it. Also, because we initially check PageIsNew with no
589 : * lock, it's possible to fall through and return the buffer while someone
590 : * else is still initializing the page (i.e., we might see pd_upper as set
591 : * but other page header fields are still zeroes). This is harmless for
592 : * callers that will take a buffer lock themselves, but some callers
593 : * inspect the page without any lock at all. The latter is OK only so
594 : * long as it doesn't depend on the page header having correct contents.
595 : * Current usage is safe because PageGetContents() does not require that.
596 : */
2545 kgrittn 597 GIC 735657 : if (PageIsNew(BufferGetPage(buf)))
598 : {
1731 tgl 599 49080 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
600 49080 : if (PageIsNew(BufferGetPage(buf)))
601 49080 : PageInit(BufferGetPage(buf), BLCKSZ, 0);
1731 tgl 602 CBC 49080 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
603 : }
5273 heikki.linnakangas 604 735657 : return buf;
7951 tgl 605 ECB : }
606 :
607 : /*
608 : * Ensure that the FSM fork is at least fsm_nblocks long, extending
5304 heikki.linnakangas 609 : * it if necessary with empty pages. And by empty, I mean pages filled
610 : * with zeros, meaning there's no free space.
611 : */
612 : static Buffer
5247 heikki.linnakangas 613 GIC 14421 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
614 : {
4 andres 615 GNC 14421 : return ExtendBufferedRelTo(EB_REL(rel), FSM_FORKNUM, NULL,
616 : EB_CREATE_FORK_IF_NEEDED |
617 : EB_CLEAR_SIZE_CACHE,
618 : fsm_nblocks,
619 : RBM_ZERO_ON_ERROR);
620 : }
621 :
7951 tgl 622 ECB : /*
623 : * Set value in given FSM page and slot.
624 : *
5304 heikki.linnakangas 625 : * If minValue > 0, the updated page is also searched for a page with at
626 : * least minValue of free space. If one is found, its slot number is
627 : * returned, -1 otherwise.
7951 tgl 628 : */
629 : static int
5304 heikki.linnakangas 630 GIC 388561 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
5304 heikki.linnakangas 631 ECB : uint8 newValue, uint8 minValue)
632 : {
633 : Buffer buf;
634 : Page page;
5304 heikki.linnakangas 635 GIC 388561 : int newslot = -1;
636 :
5304 heikki.linnakangas 637 CBC 388561 : buf = fsm_readbuf(rel, addr, true);
5304 heikki.linnakangas 638 GIC 388561 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
7341 tgl 639 ECB :
2545 kgrittn 640 CBC 388561 : page = BufferGetPage(buf);
7341 tgl 641 ECB :
5304 heikki.linnakangas 642 GIC 388561 : if (fsm_set_avail(page, slot, newValue))
3583 jdavis 643 CBC 184486 : MarkBufferDirtyHint(buf, false);
5304 heikki.linnakangas 644 ECB :
5304 heikki.linnakangas 645 CBC 388561 : if (minValue != 0)
646 : {
647 : /* Search while we still hold the lock */
648 209060 : newslot = fsm_search_avail(buf, minValue,
5304 heikki.linnakangas 649 GIC 209060 : addr.level == FSM_BOTTOM_LEVEL,
5304 heikki.linnakangas 650 ECB : true);
651 : }
652 :
5304 heikki.linnakangas 653 GIC 388561 : UnlockReleaseBuffer(buf);
654 :
655 388561 : return newslot;
7951 tgl 656 ECB : }
657 :
658 : /*
5304 heikki.linnakangas 659 : * Search the tree for a heap page with at least min_cat of free space
660 : */
661 : static BlockNumber
5304 heikki.linnakangas 662 GIC 360215 : fsm_search(Relation rel, uint8 min_cat)
663 : {
5050 bruce 664 360215 : int restarts = 0;
665 360215 : FSMAddress addr = FSM_ROOT_ADDRESS;
666 :
5304 heikki.linnakangas 667 ECB : for (;;)
7951 tgl 668 GIC 24537 : {
669 : int slot;
670 : Buffer buf;
5050 bruce 671 384752 : uint8 max_avail = 0;
672 :
673 : /* Read the FSM page. */
5246 heikki.linnakangas 674 384752 : buf = fsm_readbuf(rel, addr, false);
675 :
676 : /* Search within the page */
5304 677 384752 : if (BufferIsValid(buf))
678 : {
679 300718 : LockBuffer(buf, BUFFER_LOCK_SHARE);
680 300718 : slot = fsm_search_avail(buf, min_cat,
681 300718 : (addr.level == FSM_BOTTOM_LEVEL),
682 : false);
683 300718 : if (slot == -1)
2545 kgrittn 684 267373 : max_avail = fsm_get_max_avail(BufferGetPage(buf));
5304 heikki.linnakangas 685 300718 : UnlockReleaseBuffer(buf);
5304 heikki.linnakangas 686 ECB : }
687 : else
5304 heikki.linnakangas 688 GIC 84034 : slot = -1;
689 :
690 384752 : if (slot != -1)
691 : {
692 : /*
693 : * Descend the tree, or return the found block if we're at the
694 : * bottom.
695 : */
5304 heikki.linnakangas 696 CBC 33345 : if (addr.level == FSM_BOTTOM_LEVEL)
5304 heikki.linnakangas 697 GBC 10346 : return fsm_get_heap_blk(addr, slot);
698 :
5304 heikki.linnakangas 699 GIC 22999 : addr = fsm_get_child(addr, slot);
7951 tgl 700 ECB : }
5304 heikki.linnakangas 701 GIC 351407 : else if (addr.level == FSM_ROOT_LEVEL)
702 : {
703 : /*
704 : * At the root, failure means there's no page with enough free
705 : * space in the FSM. Give up.
706 : */
707 349869 : return InvalidBlockNumber;
708 : }
709 : else
710 : {
711 : uint16 parentslot;
712 : FSMAddress parent;
713 :
714 : /*
715 : * At lower level, failure can happen if the value in the upper-
716 : * level node didn't reflect the value on the lower page. Update
717 : * the upper node, to avoid falling into the same trap again, and
718 : * start over.
719 : *
5304 heikki.linnakangas 720 ECB : * There's a race condition here, if another backend updates this
721 : * page right after we release it, and gets the lock on the parent
722 : * page before us. We'll then update the parent page with the now
723 : * stale information we had. It's OK, because it should happen
724 : * rarely, and will be fixed by the next vacuum.
725 : */
5304 heikki.linnakangas 726 GIC 1538 : parent = fsm_get_parent(addr, &parentslot);
727 1538 : fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
728 :
5304 heikki.linnakangas 729 ECB : /*
5050 bruce 730 : * If the upper pages are badly out of date, we might need to loop
731 : * quite a few times, updating them as we go. Any inconsistencies
732 : * should eventually be corrected and the loop should end. Looping
733 : * indefinitely is nevertheless scary, so provide an emergency
734 : * valve.
735 : */
5304 heikki.linnakangas 736 CBC 1538 : if (restarts++ > 10000)
5304 heikki.linnakangas 737 UIC 0 : return InvalidBlockNumber;
7341 tgl 738 ECB :
739 : /* Start search all over from the root */
5304 heikki.linnakangas 740 GIC 1538 : addr = FSM_ROOT_ADDRESS;
741 : }
742 : }
743 : }
7506 tgl 744 ECB :
745 :
746 : /*
747 : * Recursive guts of FreeSpaceMapVacuum
748 : *
749 : * Examine the FSM page indicated by addr, as well as its children, updating
750 : * upper-level nodes that cover the heap block range from start to end-1.
751 : * (It's okay if end is beyond the actual end of the map.)
752 : * Return the maximum freespace value on this page.
1837 753 : *
754 : * If addr is past the end of the FSM, set *eof_p to true and return 0.
755 : *
756 : * This traverses the tree in depth-first order. The tree is stored
757 : * physically in depth-first order, so this should be pretty I/O efficient.
758 : */
759 : static uint8
1837 tgl 760 GIC 46787 : fsm_vacuum_page(Relation rel, FSMAddress addr,
761 : BlockNumber start, BlockNumber end,
1837 tgl 762 ECB : bool *eof_p)
7951 763 : {
764 : Buffer buf;
5050 bruce 765 : Page page;
766 : uint8 max_avail;
7341 tgl 767 :
5304 heikki.linnakangas 768 : /* Read the page if it exists, or return EOF */
5304 heikki.linnakangas 769 GIC 46787 : buf = fsm_readbuf(rel, addr, false);
5304 heikki.linnakangas 770 CBC 46787 : if (!BufferIsValid(buf))
771 : {
772 477 : *eof_p = true;
773 477 : return 0;
7951 tgl 774 EUB : }
5304 heikki.linnakangas 775 : else
5304 heikki.linnakangas 776 GIC 46310 : *eof_p = false;
7341 tgl 777 EUB :
2545 kgrittn 778 GIC 46310 : page = BufferGetPage(buf);
7951 tgl 779 ECB :
5304 heikki.linnakangas 780 : /*
1837 tgl 781 : * If we're above the bottom level, recurse into children, and fix the
782 : * information stored about them at this level.
783 : */
5304 heikki.linnakangas 784 GBC 46310 : if (addr.level > FSM_BOTTOM_LEVEL)
785 : {
1837 tgl 786 ECB : FSMAddress fsm_start,
787 : fsm_end;
788 : uint16 fsm_start_slot,
789 : fsm_end_slot;
790 : int slot,
791 : start_slot,
792 : end_slot;
5050 bruce 793 CBC 30912 : bool eof = false;
7341 tgl 794 ECB :
795 : /*
796 : * Compute the range of slots we need to update on this page, given
797 : * the requested range of heap blocks to consider. The first slot to
1837 798 : * update is the one covering the "start" block, and the last slot is
799 : * the one covering "end - 1". (Some of this work will be duplicated
800 : * in each recursive call, but it's cheap enough to not worry about.)
801 : */
1837 tgl 802 GIC 30912 : fsm_start = fsm_get_location(start, &fsm_start_slot);
1837 tgl 803 CBC 30912 : fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
1837 tgl 804 ECB :
1837 tgl 805 CBC 77280 : while (fsm_start.level < addr.level)
1837 tgl 806 ECB : {
1837 tgl 807 GIC 46368 : fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
808 46368 : fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
809 : }
810 30912 : Assert(fsm_start.level == addr.level);
811 :
1837 tgl 812 CBC 30912 : if (fsm_start.logpageno == addr.logpageno)
1837 tgl 813 GIC 30912 : start_slot = fsm_start_slot;
1837 tgl 814 UIC 0 : else if (fsm_start.logpageno > addr.logpageno)
815 0 : start_slot = SlotsPerFSMPage; /* shouldn't get here... */
816 : else
817 0 : start_slot = 0;
818 :
1837 tgl 819 GIC 30912 : if (fsm_end.logpageno == addr.logpageno)
1837 tgl 820 CBC 30683 : end_slot = fsm_end_slot;
1837 tgl 821 GIC 229 : else if (fsm_end.logpageno > addr.logpageno)
1837 tgl 822 CBC 229 : end_slot = SlotsPerFSMPage - 1;
823 : else
1837 tgl 824 LBC 0 : end_slot = -1; /* shouldn't get here... */
825 :
1837 tgl 826 GIC 1052707 : for (slot = start_slot; slot <= end_slot; slot++)
827 : {
828 : int child_avail;
829 :
4807 830 1021795 : CHECK_FOR_INTERRUPTS();
831 :
832 : /* After we hit end-of-file, just clear the rest of the slots */
5304 heikki.linnakangas 833 1021795 : if (!eof)
1837 tgl 834 31312 : child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
835 : start, end,
836 : &eof);
837 : else
5304 heikki.linnakangas 838 990483 : child_avail = 0;
839 :
840 : /* Update information about the child */
841 1021795 : if (fsm_get_avail(page, slot) != child_avail)
842 : {
843 28648 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1837 tgl 844 28648 : fsm_set_avail(page, slot, child_avail);
3583 jdavis 845 28648 : MarkBufferDirtyHint(buf, false);
5304 heikki.linnakangas 846 28648 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
847 : }
848 : }
849 : }
850 :
851 : /* Now get the maximum value on the page, to return to caller */
1837 tgl 852 46310 : max_avail = fsm_get_max_avail(page);
853 :
854 : /*
855 : * Reset the next slot pointer. This encourages the use of low-numbered
856 : * pages, increasing the chances that a later vacuum can truncate the
857 : * relation. We don't bother with a lock here, nor with marking the page
858 : * dirty if it wasn't already, since this is just a hint.
859 : */
5304 heikki.linnakangas 860 46310 : ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
861 :
862 46310 : ReleaseBuffer(buf);
863 :
864 46310 : return max_avail;
865 : }
|