Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * freespace.c
4 : : * POSTGRES free space map for quickly finding free space in relations
5 : : *
6 : : *
7 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/storage/freespace/freespace.c
12 : : *
13 : : *
14 : : * NOTES:
15 : : *
16 : : * Free Space Map keeps track of the amount of free space on pages, and
17 : : * allows quickly searching for a page with enough free space. The FSM is
18 : : * stored in a dedicated relation fork of all heap relations, and those
19 : : * index access methods that need it (see also indexfsm.c). See README for
20 : : * more information.
21 : : *
22 : : *-------------------------------------------------------------------------
23 : : */
24 : : #include "postgres.h"
25 : :
26 : : #include "access/htup_details.h"
27 : : #include "access/xloginsert.h"
28 : : #include "access/xlogutils.h"
29 : : #include "miscadmin.h"
30 : : #include "storage/freespace.h"
31 : : #include "storage/fsm_internals.h"
32 : : #include "storage/smgr.h"
33 : : #include "utils/rel.h"
34 : :
35 : :
36 : : /*
37 : : * We use just one byte to store the amount of free space on a page, so we
38 : : * divide the amount of free space a page can have into 256 different
39 : : * categories. The highest category, 255, represents a page with at least
40 : : * MaxFSMRequestSize bytes of free space, and the second highest category
41 : : * represents the range from 254 * FSM_CAT_STEP, inclusive, to
42 : : * MaxFSMRequestSize, exclusive.
43 : : *
44 : : * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
45 : : * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
46 : : * categories look like this:
47 : : *
48 : : *
49 : : * Range Category
50 : : * 0 - 31 0
51 : : * 32 - 63 1
52 : : * ... ... ...
53 : : * 8096 - 8127 253
54 : : * 8128 - 8163 254
55 : : * 8164 - 8192 255
56 : : *
57 : : * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
58 : : * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
59 : : * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
60 : : * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
61 : : * completely empty page, that would mean that we could never satisfy a
62 : : * request of exactly MaxFSMRequestSize bytes.
63 : : */
64 : : #define FSM_CATEGORIES 256
65 : : #define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
66 : : #define MaxFSMRequestSize MaxHeapTupleSize
67 : :
68 : : /*
69 : : * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
70 : : * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
71 : : * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
72 : : * this means that 4096 bytes is the smallest BLCKSZ that we can get away
73 : : * with a 3-level tree, and 512 is the smallest we support.
74 : : */
75 : : #define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
76 : :
77 : : #define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
78 : : #define FSM_BOTTOM_LEVEL 0
79 : :
80 : : /*
81 : : * The internal FSM routines work on a logical addressing scheme. Each
82 : : * level of the tree can be thought of as a separately addressable file.
83 : : */
84 : : typedef struct
85 : : {
86 : : int level; /* level */
87 : : int logpageno; /* page number within the level */
88 : : } FSMAddress;
89 : :
90 : : /* Address of the root page. */
91 : : static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
92 : :
93 : : /* functions to navigate the tree */
94 : : static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
95 : : static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
96 : : static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
97 : : static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
98 : : static BlockNumber fsm_logical_to_physical(FSMAddress addr);
99 : :
100 : : static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
101 : : static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
102 : :
103 : : /* functions to convert amount of free space to a FSM category */
104 : : static uint8 fsm_space_avail_to_cat(Size avail);
105 : : static uint8 fsm_space_needed_to_cat(Size needed);
106 : : static Size fsm_space_cat_to_avail(uint8 cat);
107 : :
108 : : /* workhorse functions for various operations */
109 : : static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
110 : : uint8 newValue, uint8 minValue);
111 : : static BlockNumber fsm_search(Relation rel, uint8 min_cat);
112 : : static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
113 : : BlockNumber start, BlockNumber end,
114 : : bool *eof_p);
115 : : static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
116 : :
117 : :
118 : : /******** Public API ********/
119 : :
120 : : /*
121 : : * GetPageWithFreeSpace - try to find a page in the given relation with
122 : : * at least the specified amount of free space.
123 : : *
124 : : * If successful, return the block number; if not, return InvalidBlockNumber.
125 : : *
126 : : * The caller must be prepared for the possibility that the returned page
127 : : * will turn out to have too little space available by the time the caller
128 : : * gets a lock on it. In that case, the caller should report the actual
129 : : * amount of free space available on that page and then try again (see
130 : : * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
131 : : * extend the relation.
132 : : *
133 : : * This can trigger FSM updates if any FSM entry is found to point to a block
134 : : * past the end of the relation.
135 : : */
136 : : BlockNumber
1804 akapila@postgresql.o 137 :CBC 79155 : GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
138 : : {
5421 bruce@momjian.us 139 : 79155 : uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
140 : :
1804 akapila@postgresql.o 141 : 79155 : return fsm_search(rel, min_cat);
142 : : }
143 : :
144 : : /*
145 : : * RecordAndGetPageWithFreeSpace - update info about a page and try again.
146 : : *
147 : : * We provide this combo form to save some locking overhead, compared to
148 : : * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
149 : : * also some effort to return a page close to the old page; if there's a
150 : : * page with enough free space on the same FSM page where the old one page
151 : : * is located, it is preferred.
152 : : */
153 : : BlockNumber
5675 heikki.linnakangas@i 154 : 94040 : RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
155 : : Size oldSpaceAvail, Size spaceNeeded)
156 : : {
1804 akapila@postgresql.o 157 : 94040 : int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
158 : 94040 : int search_cat = fsm_space_needed_to_cat(spaceNeeded);
159 : : FSMAddress addr;
160 : : uint16 slot;
161 : : int search_slot;
162 : :
163 : : /* Get the location of the FSM byte representing the heap block */
5675 heikki.linnakangas@i 164 : 94040 : addr = fsm_get_location(oldPage, &slot);
165 : :
166 : 94040 : search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
167 : :
168 : : /*
169 : : * If fsm_set_and_search found a suitable new block, return that.
170 : : * Otherwise, search as usual.
171 : : */
172 [ + + ]: 94040 : if (search_slot != -1)
173 : : {
1 noah@leadboat.com 174 : 14359 : BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
175 : :
176 : : /*
177 : : * Check that the blknum is actually in the relation. Don't try to
178 : : * update the FSM in that case, just fall back to the other case
179 : : */
180 [ + - ]: 14359 : if (fsm_does_block_exist(rel, blknum))
181 : 14359 : return blknum;
182 : : }
183 : 79681 : return fsm_search(rel, search_cat);
184 : : }
185 : :
186 : : /*
187 : : * RecordPageWithFreeSpace - update info about a page.
188 : : *
189 : : * Note that if the new spaceAvail value is higher than the old value stored
190 : : * in the FSM, the space might not become visible to searchers until the next
191 : : * FreeSpaceMapVacuum call, which updates the upper level pages.
192 : : */
193 : : void
1804 akapila@postgresql.o 194 : 342333 : RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
195 : : {
196 : 342333 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
197 : : FSMAddress addr;
198 : : uint16 slot;
199 : :
200 : : /* Get the location of the FSM byte representing the heap block */
5675 heikki.linnakangas@i 201 : 342333 : addr = fsm_get_location(heapBlk, &slot);
202 : :
203 : 342333 : fsm_set_and_search(rel, addr, slot, new_cat, 0);
7712 tgl@sss.pgh.pa.us 204 : 342333 : }
205 : :
206 : : /*
207 : : * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
208 : : * WAL replay
209 : : */
210 : : void
648 rhaas@postgresql.org 211 : 357858 : XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
212 : : Size spaceAvail)
213 : : {
5644 heikki.linnakangas@i 214 : 357858 : int new_cat = fsm_space_avail_to_cat(spaceAvail);
215 : : FSMAddress addr;
216 : : uint16 slot;
217 : : BlockNumber blkno;
218 : : Buffer buf;
219 : : Page page;
220 : :
221 : : /* Get the location of the FSM byte representing the heap block */
222 : 357858 : addr = fsm_get_location(heapBlk, &slot);
223 : 357858 : blkno = fsm_logical_to_physical(addr);
224 : :
225 : : /* If the page doesn't exist already, extend */
648 rhaas@postgresql.org 226 : 357858 : buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
227 : : RBM_ZERO_ON_ERROR, InvalidBuffer);
5563 heikki.linnakangas@i 228 : 357858 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
229 : :
2916 kgrittn@postgresql.o 230 : 357858 : page = BufferGetPage(buf);
5644 heikki.linnakangas@i 231 [ + + ]: 357858 : if (PageIsNew(page))
232 : 447 : PageInit(page, BLCKSZ, 0);
233 : :
234 [ + + ]: 357858 : if (fsm_set_avail(page, slot, new_cat))
3954 jdavis@postgresql.or 235 : 350081 : MarkBufferDirtyHint(buf, false);
5644 heikki.linnakangas@i 236 : 357858 : UnlockReleaseBuffer(buf);
237 : 357858 : }
238 : :
239 : : /*
240 : : * GetRecordedFreeSpace - return the amount of free space on a particular page,
241 : : * according to the FSM.
242 : : */
243 : : Size
5675 244 : 1347 : GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
245 : : {
246 : : FSMAddress addr;
247 : : uint16 slot;
248 : : Buffer buf;
249 : : uint8 cat;
250 : :
251 : : /* Get the location of the FSM byte representing the heap block */
252 : 1347 : addr = fsm_get_location(heapBlk, &slot);
253 : :
254 : 1347 : buf = fsm_readbuf(rel, addr, false);
255 [ + + ]: 1347 : if (!BufferIsValid(buf))
256 : 36 : return 0;
2916 kgrittn@postgresql.o 257 : 1311 : cat = fsm_get_avail(BufferGetPage(buf), slot);
5675 heikki.linnakangas@i 258 : 1311 : ReleaseBuffer(buf);
259 : :
260 : 1311 : return fsm_space_cat_to_avail(cat);
261 : : }
262 : :
263 : : /*
264 : : * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
265 : : *
266 : : * nblocks is the new size of the heap.
267 : : *
268 : : * Return the number of blocks of new FSM.
269 : : * If it's InvalidBlockNumber, there is nothing to truncate;
270 : : * otherwise the caller is responsible for calling smgrtruncate()
271 : : * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
272 : : * to update upper-level pages in the FSM.
273 : : */
274 : : BlockNumber
1664 fujii@postgresql.org 275 : 200 : FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
276 : : {
277 : : BlockNumber new_nfsmblocks;
278 : : FSMAddress first_removed_address;
279 : : uint16 first_removed_slot;
280 : : Buffer buf;
281 : :
282 : : /*
283 : : * If no FSM has been created yet for this relation, there's nothing to
284 : : * truncate.
285 : : */
1007 tgl@sss.pgh.pa.us 286 [ - + ]: 200 : if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
1664 fujii@postgresql.org 287 :UBC 0 : return InvalidBlockNumber;
288 : :
289 : : /* Get the location in the FSM of the first removed heap block */
5675 heikki.linnakangas@i 290 :CBC 200 : first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
291 : :
292 : : /*
293 : : * Zero out the tail of the last remaining FSM page. If the slot
294 : : * representing the first removed heap block is at a page boundary, as the
295 : : * first slot on the FSM page that first_removed_address points to, we can
296 : : * just truncate that page altogether.
297 : : */
298 [ + + ]: 200 : if (first_removed_slot > 0)
299 : : {
300 : 122 : buf = fsm_readbuf(rel, first_removed_address, false);
301 [ - + ]: 122 : if (!BufferIsValid(buf))
1431 tgl@sss.pgh.pa.us 302 :UBC 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
303 : : * smaller */
5675 heikki.linnakangas@i 304 :CBC 122 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
305 : :
306 : : /* NO EREPORT(ERROR) from here till changes are logged */
2734 307 : 122 : START_CRIT_SECTION();
308 : :
2916 kgrittn@postgresql.o 309 : 122 : fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
310 : :
311 : : /*
312 : : * This change is non-critical, because fsm_does_block_exist() would
313 : : * stop us from returning a truncated-away block. However, since this
314 : : * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
315 : : * of that many fsm_does_block_exist() rejections. Use a full
316 : : * MarkBufferDirty(), not MarkBufferDirtyHint().
317 : : */
2734 heikki.linnakangas@i 318 : 122 : MarkBufferDirty(buf);
319 : :
320 : : /*
321 : : * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
322 : : * differing from the rest of the file in this respect. This is
323 : : * optional; see README mention of full page images. XXX consider
324 : : * XLogSaveBufferForHint() for even closer similarity.
325 : : *
326 : : * A higher-level operation calls us at WAL replay. If we crash
327 : : * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
328 : : * not changed, and our fork remains valid. If we crash after that
329 : : * flush, redo will return here.
330 : : */
331 [ + + + + : 122 : if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+ + + - +
- + - +
+ ]
332 : 18 : log_newpage_buffer(buf, false);
333 : :
334 [ - + ]: 122 : END_CRIT_SECTION();
335 : :
5675 336 : 122 : UnlockReleaseBuffer(buf);
337 : :
338 : 122 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
339 : : }
340 : : else
341 : : {
342 : 78 : new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
1007 tgl@sss.pgh.pa.us 343 [ - + ]: 78 : if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
1431 tgl@sss.pgh.pa.us 344 :UBC 0 : return InvalidBlockNumber; /* nothing to do; the FSM was already
345 : : * smaller */
346 : : }
347 : :
1664 fujii@postgresql.org 348 :CBC 200 : return new_nfsmblocks;
349 : : }
350 : :
351 : : /*
352 : : * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
353 : : *
354 : : * We assume that the bottom-level pages have already been updated with
355 : : * new free-space information.
356 : : */
357 : : void
5675 heikki.linnakangas@i 358 : 166 : FreeSpaceMapVacuum(Relation rel)
359 : : {
360 : : bool dummy;
361 : :
362 : : /* Recursively scan the tree, starting at the root */
2208 tgl@sss.pgh.pa.us 363 : 166 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
364 : : (BlockNumber) 0, InvalidBlockNumber,
365 : : &dummy);
366 : 166 : }
367 : :
368 : : /*
369 : : * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
370 : : *
371 : : * As above, but assume that only heap pages between start and end-1 inclusive
372 : : * have new free-space information, so update only the upper-level slots
373 : : * covering that block range. end == InvalidBlockNumber is equivalent to
374 : : * "all the rest of the relation".
375 : : */
376 : : void
377 : 34343 : FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
378 : : {
379 : : bool dummy;
380 : :
381 : : /* Recursively scan the tree, starting at the root */
382 [ + + ]: 34343 : if (end > start)
383 : 34211 : (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
8322 384 : 34343 : }
385 : :
386 : : /******** Internal routines ********/
387 : :
388 : : /*
389 : : * Return category corresponding x bytes of free space
390 : : */
391 : : static uint8
5675 heikki.linnakangas@i 392 : 794231 : fsm_space_avail_to_cat(Size avail)
393 : : {
394 : : int cat;
395 : :
396 [ - + ]: 794231 : Assert(avail < BLCKSZ);
397 : :
398 [ + + ]: 794231 : if (avail >= MaxFSMRequestSize)
399 : 20642 : return 255;
400 : :
401 : 773589 : cat = avail / FSM_CAT_STEP;
402 : :
403 : : /*
404 : : * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
405 : : * more.
406 : : */
407 [ - + ]: 773589 : if (cat > 254)
5675 heikki.linnakangas@i 408 :UBC 0 : cat = 254;
409 : :
5675 heikki.linnakangas@i 410 :CBC 773589 : return (uint8) cat;
411 : : }
412 : :
413 : : /*
414 : : * Return the lower bound of the range of free space represented by given
415 : : * category.
416 : : */
417 : : static Size
418 : 1311 : fsm_space_cat_to_avail(uint8 cat)
419 : : {
420 : : /* The highest category represents exactly MaxFSMRequestSize bytes. */
421 [ + + ]: 1311 : if (cat == 255)
422 : 800 : return MaxFSMRequestSize;
423 : : else
424 : 511 : return cat * FSM_CAT_STEP;
425 : : }
426 : :
427 : : /*
428 : : * Which category does a page need to have, to accommodate x bytes of data?
429 : : * While fsm_space_avail_to_cat() rounds down, this needs to round up.
430 : : */
431 : : static uint8
432 : 173195 : fsm_space_needed_to_cat(Size needed)
433 : : {
434 : : int cat;
435 : :
436 : : /* Can't ask for more space than the highest category represents */
437 [ - + ]: 173195 : if (needed > MaxFSMRequestSize)
3734 tgl@sss.pgh.pa.us 438 [ # # ]:UBC 0 : elog(ERROR, "invalid FSM request size %zu", needed);
439 : :
5675 heikki.linnakangas@i 440 [ - + ]:CBC 173195 : if (needed == 0)
5675 heikki.linnakangas@i 441 :UBC 0 : return 1;
442 : :
5675 heikki.linnakangas@i 443 :CBC 173195 : cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
444 : :
445 [ - + ]: 173195 : if (cat > 255)
5675 heikki.linnakangas@i 446 :UBC 0 : cat = 255;
447 : :
5675 heikki.linnakangas@i 448 :CBC 173195 : return (uint8) cat;
449 : : }
450 : :
451 : : /*
452 : : * Returns the physical block number of a FSM page
453 : : */
454 : : static BlockNumber
455 : 1085906 : fsm_logical_to_physical(FSMAddress addr)
456 : : {
457 : : BlockNumber pages;
458 : : int leafno;
459 : : int l;
460 : :
461 : : /*
462 : : * Calculate the logical page number of the first leaf page below the
463 : : * given page.
464 : : */
465 : 1085906 : leafno = addr.logpageno;
466 [ + + ]: 1525679 : for (l = 0; l < addr.level; l++)
467 : 439773 : leafno *= SlotsPerFSMPage;
468 : :
469 : : /* Count upper level nodes required to address the leaf page */
470 : 1085906 : pages = 0;
471 [ + + ]: 4343624 : for (l = 0; l < FSM_TREE_DEPTH; l++)
472 : : {
473 : 3257718 : pages += leafno + 1;
474 : 3257718 : leafno /= SlotsPerFSMPage;
475 : : }
476 : :
477 : : /*
478 : : * If the page we were asked for wasn't at the bottom level, subtract the
479 : : * additional lower level pages we counted above.
480 : : */
481 : 1085906 : pages -= addr.level;
482 : :
483 : : /* Turn the page count into 0-based block number */
484 : 1085906 : return pages - 1;
485 : : }
486 : :
487 : : /*
488 : : * Return the FSM location corresponding to given heap block.
489 : : */
490 : : static FSMAddress
491 : 933210 : fsm_get_location(BlockNumber heapblk, uint16 *slot)
492 : : {
493 : : FSMAddress addr;
494 : :
495 : 933210 : addr.level = FSM_BOTTOM_LEVEL;
496 : 933210 : addr.logpageno = heapblk / SlotsPerFSMPage;
497 : 933210 : *slot = heapblk % SlotsPerFSMPage;
498 : :
499 : 933210 : return addr;
500 : : }
501 : :
502 : : /*
503 : : * Return the heap block number corresponding to given location in the FSM.
504 : : */
505 : : static BlockNumber
506 : 24964 : fsm_get_heap_blk(FSMAddress addr, uint16 slot)
507 : : {
508 [ - + ]: 24964 : Assert(addr.level == FSM_BOTTOM_LEVEL);
509 : 24964 : return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
510 : : }
511 : :
512 : : /*
513 : : * Given a logical address of a child page, get the logical page number of
514 : : * the parent, and the slot within the parent corresponding to the child.
515 : : */
516 : : static FSMAddress
517 : 207934 : fsm_get_parent(FSMAddress child, uint16 *slot)
518 : : {
519 : : FSMAddress parent;
520 : :
521 [ - + ]: 207934 : Assert(child.level < FSM_ROOT_LEVEL);
522 : :
523 : 207934 : parent.level = child.level + 1;
524 : 207934 : parent.logpageno = child.logpageno / SlotsPerFSMPage;
525 : 207934 : *slot = child.logpageno % SlotsPerFSMPage;
526 : :
527 : 207934 : return parent;
528 : : }
529 : :
530 : : /*
531 : : * Given a logical address of a parent page and a slot number, get the
532 : : * logical address of the corresponding child page.
533 : : */
534 : : static FSMAddress
535 : 93221 : fsm_get_child(FSMAddress parent, uint16 slot)
536 : : {
537 : : FSMAddress child;
538 : :
539 [ - + ]: 93221 : Assert(parent.level > FSM_BOTTOM_LEVEL);
540 : :
541 : 93221 : child.level = parent.level - 1;
542 : 93221 : child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
543 : :
544 : 93221 : return child;
545 : : }
546 : :
547 : : /*
548 : : * Read a FSM page.
549 : : *
550 : : * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
551 : : * true, the FSM file is extended.
552 : : */
553 : : static Buffer
554 : 727848 : fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
555 : : {
556 : 727848 : BlockNumber blkno = fsm_logical_to_physical(addr);
557 : : Buffer buf;
74 heikki.linnakangas@i 558 :GNC 727848 : SMgrRelation reln = RelationGetSmgr(rel);
559 : :
560 : : /*
561 : : * If we haven't cached the size of the FSM yet, check it first. Also
562 : : * recheck if the requested block seems to be past end, since our cached
563 : : * value might be stale. (We send smgr inval messages on truncation, but
564 : : * not on extension.)
565 : : */
1007 tgl@sss.pgh.pa.us 566 [ + + ]:CBC 727848 : if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
567 [ + + ]: 644032 : blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
568 : : {
569 : : /* Invalidate the cache so smgrnblocks asks the kernel. */
570 : 114708 : reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
571 [ + + ]: 114708 : if (smgrexists(reln, FSM_FORKNUM))
572 : 57820 : smgrnblocks(reln, FSM_FORKNUM);
573 : : else
574 : 56888 : reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
575 : : }
576 : :
577 : : /*
578 : : * For reading we use ZERO_ON_ERROR mode, and initialize the page if
579 : : * necessary. The FSM information is not accurate anyway, so it's better
580 : : * to clear corrupt pages than error out. Since the FSM changes are not
581 : : * WAL-logged, the so-called torn page problem on crash can lead to pages
582 : : * with corrupt headers, for example.
583 : : *
584 : : * We use the same path below to initialize pages when extending the
585 : : * relation, as a concurrent extension can end up with vm_extend()
586 : : * returning an already-initialized page.
587 : : */
588 [ + + ]: 727848 : if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
589 : : {
5675 heikki.linnakangas@i 590 [ + + ]: 57596 : if (extend)
375 andres@anarazel.de 591 : 3400 : buf = fsm_extend(rel, blkno + 1);
592 : : else
5675 heikki.linnakangas@i 593 : 54196 : return InvalidBuffer;
594 : : }
595 : : else
375 andres@anarazel.de 596 : 670252 : buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
597 : :
598 : : /*
599 : : * Initializing the page when needed is trickier than it looks, because of
600 : : * the possibility of multiple backends doing this concurrently, and our
601 : : * desire to not uselessly take the buffer lock in the normal path where
602 : : * the page is OK. We must take the lock to initialize the page, so
603 : : * recheck page newness after we have the lock, in case someone else
604 : : * already did it. Also, because we initially check PageIsNew with no
605 : : * lock, it's possible to fall through and return the buffer while someone
606 : : * else is still initializing the page (i.e., we might see pd_upper as set
607 : : * but other page header fields are still zeroes). This is harmless for
608 : : * callers that will take a buffer lock themselves, but some callers
609 : : * inspect the page without any lock at all. The latter is OK only so
610 : : * long as it doesn't depend on the page header having correct contents.
611 : : * Current usage is safe because PageGetContents() does not require that.
612 : : */
2916 kgrittn@postgresql.o 613 [ + + ]: 673652 : if (PageIsNew(BufferGetPage(buf)))
614 : : {
2102 tgl@sss.pgh.pa.us 615 : 11409 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
616 [ + - ]: 11409 : if (PageIsNew(BufferGetPage(buf)))
617 : 11409 : PageInit(BufferGetPage(buf), BLCKSZ, 0);
618 : 11409 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
619 : : }
5644 heikki.linnakangas@i 620 : 673652 : return buf;
621 : : }
622 : :
623 : : /*
624 : : * Ensure that the FSM fork is at least fsm_nblocks long, extending
625 : : * it if necessary with empty pages. And by empty, I mean pages filled
626 : : * with zeros, meaning there's no free space.
627 : : */
628 : : static Buffer
5618 629 : 3400 : fsm_extend(Relation rel, BlockNumber fsm_nblocks)
630 : : {
235 tmunro@postgresql.or 631 : 3400 : return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
632 : : EB_CREATE_FORK_IF_NEEDED |
633 : : EB_CLEAR_SIZE_CACHE,
634 : : fsm_nblocks,
635 : : RBM_ZERO_ON_ERROR);
636 : : }
637 : :
638 : : /*
639 : : * Set value in given FSM page and slot.
640 : : *
641 : : * If minValue > 0, the updated page is also searched for a page with at
642 : : * least minValue of free space. If one is found, its slot number is
643 : : * returned, -1 otherwise.
644 : : */
645 : : static int
5675 heikki.linnakangas@i 646 : 438159 : fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
647 : : uint8 newValue, uint8 minValue)
648 : : {
649 : : Buffer buf;
650 : : Page page;
651 : 438159 : int newslot = -1;
652 : :
653 : 438159 : buf = fsm_readbuf(rel, addr, true);
654 : 438159 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
655 : :
2916 kgrittn@postgresql.o 656 : 438159 : page = BufferGetPage(buf);
657 : :
5675 heikki.linnakangas@i 658 [ + + ]: 438159 : if (fsm_set_avail(page, slot, newValue))
3954 jdavis@postgresql.or 659 : 95518 : MarkBufferDirtyHint(buf, false);
660 : :
5675 heikki.linnakangas@i 661 [ + + ]: 438159 : if (minValue != 0)
662 : : {
663 : : /* Search while we still hold the lock */
664 : 94040 : newslot = fsm_search_avail(buf, minValue,
665 : 94040 : addr.level == FSM_BOTTOM_LEVEL,
666 : : true);
667 : : }
668 : :
669 : 438159 : UnlockReleaseBuffer(buf);
670 : :
671 : 438159 : return newslot;
672 : : }
673 : :
674 : : /*
675 : : * Search the tree for a heap page with at least min_cat of free space
676 : : */
677 : : static BlockNumber
678 : 158836 : fsm_search(Relation rel, uint8 min_cat)
679 : : {
5421 bruce@momjian.us 680 : 158836 : int restarts = 0;
681 : 158836 : FSMAddress addr = FSM_ROOT_ADDRESS;
682 : :
683 : : for (;;)
8322 tgl@sss.pgh.pa.us 684 : 25675 : {
685 : : int slot;
686 : : Buffer buf;
5421 bruce@momjian.us 687 : 184511 : uint8 max_avail = 0;
688 : :
689 : : /* Read the FSM page. */
5617 heikki.linnakangas@i 690 : 184511 : buf = fsm_readbuf(rel, addr, false);
691 : :
692 : : /* Search within the page */
5675 693 [ + + ]: 184511 : if (BufferIsValid(buf))
694 : : {
695 : 131064 : LockBuffer(buf, BUFFER_LOCK_SHARE);
696 : 131064 : slot = fsm_search_avail(buf, min_cat,
697 : 131064 : (addr.level == FSM_BOTTOM_LEVEL),
698 : : false);
699 [ + + ]: 131064 : if (slot == -1)
700 : : {
2916 kgrittn@postgresql.o 701 : 96570 : max_avail = fsm_get_max_avail(BufferGetPage(buf));
1 noah@leadboat.com 702 : 96570 : UnlockReleaseBuffer(buf);
703 : : }
704 : : else
705 : : {
706 : : /* Keep the pin for possible update below */
707 : 34494 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
708 : : }
709 : : }
710 : : else
5675 heikki.linnakangas@i 711 : 53447 : slot = -1;
712 : :
713 [ + + ]: 184511 : if (slot != -1)
714 : : {
715 : : /*
716 : : * Descend the tree, or return the found block if we're at the
717 : : * bottom.
718 : : */
719 [ + + ]: 34494 : if (addr.level == FSM_BOTTOM_LEVEL)
720 : : {
1 noah@leadboat.com 721 : 10605 : BlockNumber blkno = fsm_get_heap_blk(addr, slot);
722 : : Page page;
723 : :
724 [ + - ]: 10605 : if (fsm_does_block_exist(rel, blkno))
725 : : {
726 : 10605 : ReleaseBuffer(buf);
727 : 10605 : return blkno;
728 : : }
729 : :
730 : : /*
731 : : * Block is past the end of the relation. Update FSM, and
732 : : * restart from root. The usual "advancenext" behavior is
733 : : * pessimal for this rare scenario, since every later slot is
734 : : * unusable in the same way. We could zero all affected slots
735 : : * on the same FSM page, but don't bet on the benefits of that
736 : : * optimization justifying its compiled code bulk.
737 : : */
1 noah@leadboat.com 738 :UBC 0 : page = BufferGetPage(buf);
739 : 0 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
740 : 0 : fsm_set_avail(page, slot, 0);
741 : 0 : MarkBufferDirtyHint(buf, false);
742 : 0 : UnlockReleaseBuffer(buf);
743 [ # # ]: 0 : if (restarts++ > 10000) /* same rationale as below */
744 : 0 : return InvalidBlockNumber;
745 : 0 : addr = FSM_ROOT_ADDRESS;
746 : : }
747 : : else
748 : : {
1 noah@leadboat.com 749 :CBC 23889 : ReleaseBuffer(buf);
750 : : }
5675 heikki.linnakangas@i 751 : 23889 : addr = fsm_get_child(addr, slot);
752 : : }
753 [ + + ]: 150017 : else if (addr.level == FSM_ROOT_LEVEL)
754 : : {
755 : : /*
756 : : * At the root, failure means there's no page with enough free
757 : : * space in the FSM. Give up.
758 : : */
759 : 148231 : return InvalidBlockNumber;
760 : : }
761 : : else
762 : : {
763 : : uint16 parentslot;
764 : : FSMAddress parent;
765 : :
766 : : /*
767 : : * At lower level, failure can happen if the value in the upper-
768 : : * level node didn't reflect the value on the lower page. Update
769 : : * the upper node, to avoid falling into the same trap again, and
770 : : * start over.
771 : : *
772 : : * There's a race condition here, if another backend updates this
773 : : * page right after we release it, and gets the lock on the parent
774 : : * page before us. We'll then update the parent page with the now
775 : : * stale information we had. It's OK, because it should happen
776 : : * rarely, and will be fixed by the next vacuum.
777 : : */
778 : 1786 : parent = fsm_get_parent(addr, &parentslot);
779 : 1786 : fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
780 : :
781 : : /*
782 : : * If the upper pages are badly out of date, we might need to loop
783 : : * quite a few times, updating them as we go. Any inconsistencies
784 : : * should eventually be corrected and the loop should end. Looping
785 : : * indefinitely is nevertheless scary, so provide an emergency
786 : : * valve.
787 : : */
788 [ - + ]: 1786 : if (restarts++ > 10000)
5675 heikki.linnakangas@i 789 :UBC 0 : return InvalidBlockNumber;
790 : :
791 : : /* Start search all over from the root */
5675 heikki.linnakangas@i 792 :CBC 1786 : addr = FSM_ROOT_ADDRESS;
793 : : }
794 : : }
795 : : }
796 : :
797 : :
798 : : /*
799 : : * Recursive guts of FreeSpaceMapVacuum
800 : : *
801 : : * Examine the FSM page indicated by addr, as well as its children, updating
802 : : * upper-level nodes that cover the heap block range from start to end-1.
803 : : * (It's okay if end is beyond the actual end of the map.)
804 : : * Return the maximum freespace value on this page.
805 : : *
806 : : * If addr is past the end of the FSM, set *eof_p to true and return 0.
807 : : *
808 : : * This traverses the tree in depth-first order. The tree is stored
809 : : * physically in depth-first order, so this should be pretty I/O efficient.
810 : : */
811 : : static uint8
2208 tgl@sss.pgh.pa.us 812 : 103709 : fsm_vacuum_page(Relation rel, FSMAddress addr,
813 : : BlockNumber start, BlockNumber end,
814 : : bool *eof_p)
815 : : {
816 : : Buffer buf;
817 : : Page page;
818 : : uint8 max_avail;
819 : :
820 : : /* Read the page if it exists, or return EOF */
5675 heikki.linnakangas@i 821 : 103709 : buf = fsm_readbuf(rel, addr, false);
822 [ + + ]: 103709 : if (!BufferIsValid(buf))
823 : : {
824 : 713 : *eof_p = true;
825 : 713 : return 0;
826 : : }
827 : : else
828 : 102996 : *eof_p = false;
829 : :
2916 kgrittn@postgresql.o 830 : 102996 : page = BufferGetPage(buf);
831 : :
832 : : /*
833 : : * If we're above the bottom level, recurse into children, and fix the
834 : : * information stored about them at this level.
835 : : */
5675 heikki.linnakangas@i 836 [ + + ]: 102996 : if (addr.level > FSM_BOTTOM_LEVEL)
837 : : {
838 : : FSMAddress fsm_start,
839 : : fsm_end;
840 : : uint16 fsm_start_slot,
841 : : fsm_end_slot;
842 : : int slot,
843 : : start_slot,
844 : : end_slot;
5421 bruce@momjian.us 845 : 68716 : bool eof = false;
846 : :
847 : : /*
848 : : * Compute the range of slots we need to update on this page, given
849 : : * the requested range of heap blocks to consider. The first slot to
850 : : * update is the one covering the "start" block, and the last slot is
851 : : * the one covering "end - 1". (Some of this work will be duplicated
852 : : * in each recursive call, but it's cheap enough to not worry about.)
853 : : */
2208 tgl@sss.pgh.pa.us 854 : 68716 : fsm_start = fsm_get_location(start, &fsm_start_slot);
855 : 68716 : fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
856 : :
857 [ + + ]: 171790 : while (fsm_start.level < addr.level)
858 : : {
859 : 103074 : fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
860 : 103074 : fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
861 : : }
862 [ - + ]: 68716 : Assert(fsm_start.level == addr.level);
863 : :
864 [ + - ]: 68716 : if (fsm_start.logpageno == addr.logpageno)
865 : 68716 : start_slot = fsm_start_slot;
2208 tgl@sss.pgh.pa.us 866 [ # # ]:UBC 0 : else if (fsm_start.logpageno > addr.logpageno)
867 : 0 : start_slot = SlotsPerFSMPage; /* shouldn't get here... */
868 : : else
869 : 0 : start_slot = 0;
870 : :
2208 tgl@sss.pgh.pa.us 871 [ + + ]:CBC 68716 : if (fsm_end.logpageno == addr.logpageno)
872 : 68369 : end_slot = fsm_end_slot;
873 [ + - ]: 347 : else if (fsm_end.logpageno > addr.logpageno)
874 : 347 : end_slot = SlotsPerFSMPage - 1;
875 : : else
2208 tgl@sss.pgh.pa.us 876 :UBC 0 : end_slot = -1; /* shouldn't get here... */
877 : :
2208 tgl@sss.pgh.pa.us 878 [ + + ]:CBC 1638901 : for (slot = start_slot; slot <= end_slot; slot++)
879 : : {
880 : : int child_avail;
881 : :
5178 882 [ - + ]: 1570185 : CHECK_FOR_INTERRUPTS();
883 : :
884 : : /* After we hit end-of-file, just clear the rest of the slots */
5675 heikki.linnakangas@i 885 [ + + ]: 1570185 : if (!eof)
2208 tgl@sss.pgh.pa.us 886 : 69332 : child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
887 : : start, end,
888 : : &eof);
889 : : else
5675 heikki.linnakangas@i 890 : 1500853 : child_avail = 0;
891 : :
892 : : /* Update information about the child */
893 [ + + ]: 1570185 : if (fsm_get_avail(page, slot) != child_avail)
894 : : {
895 : 6730 : LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2208 tgl@sss.pgh.pa.us 896 : 6730 : fsm_set_avail(page, slot, child_avail);
3954 jdavis@postgresql.or 897 : 6730 : MarkBufferDirtyHint(buf, false);
5675 heikki.linnakangas@i 898 : 6730 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
899 : : }
900 : : }
901 : : }
902 : :
903 : : /* Now get the maximum value on the page, to return to caller */
2208 tgl@sss.pgh.pa.us 904 : 102996 : max_avail = fsm_get_max_avail(page);
905 : :
906 : : /*
907 : : * Reset the next slot pointer. This encourages the use of low-numbered
908 : : * pages, increasing the chances that a later vacuum can truncate the
909 : : * relation. We don't bother with a lock here, nor with marking the page
910 : : * dirty if it wasn't already, since this is just a hint.
911 : : */
5675 heikki.linnakangas@i 912 : 102996 : ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
913 : :
914 : 102996 : ReleaseBuffer(buf);
915 : :
916 : 102996 : return max_avail;
917 : : }
918 : :
919 : :
920 : : /*
921 : : * Check whether a block number is past the end of the relation. This can
922 : : * happen after WAL replay, if the FSM reached disk but newly-extended pages
923 : : * it refers to did not.
924 : : */
925 : : static bool
1 noah@leadboat.com 926 : 24964 : fsm_does_block_exist(Relation rel, BlockNumber blknumber)
927 : : {
928 : 24964 : SMgrRelation smgr = RelationGetSmgr(rel);
929 : :
930 : : /*
931 : : * If below the cached nblocks, the block surely exists. Otherwise, we
932 : : * face a trade-off. We opt to compare to a fresh nblocks, incurring
933 : : * lseek() overhead. The alternative would be to assume the block does
934 : : * not exist, but that would cause FSM to set zero space available for
935 : : * blocks that main fork extension just recorded.
936 : : */
937 : 24964 : return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
938 [ + + + + : 34096 : blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
+ - ]
939 : 9132 : blknumber < RelationGetNumberOfBlocks(rel));
940 : : }
|