/*-------------------------------------------------------------------------
 *
 * freelist.c
 *    routines for managing the buffer pool's replacement strategy.
 *
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/storage/buffer/freelist.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "pgstat.h"
#include "port/atomics.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"

#define INT_ACCESS_ONCE(var)	((int)(*((volatile int *)&(var))))

/*
 * The shared freelist control information.
 */
typedef struct
{
    /* Spinlock: protects the values below */
    slock_t     buffer_strategy_lock;

    /*
     * Clock sweep hand: index of the next buffer to consider grabbing. Note
     * that this isn't a concrete buffer - we only ever increase the value,
     * so to get an actual buffer it needs to be used modulo NBuffers.
     */
    pg_atomic_uint32 nextVictimBuffer;

    int         firstFreeBuffer;    /* Head of list of unused buffers */
    int         lastFreeBuffer;     /* Tail of list of unused buffers */

    /*
     * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
     * when the list is empty)
     */

    /*
     * Statistics.  These counters should be wide enough that they can't
     * overflow during a single bgwriter cycle.
     */
    uint32      completePasses;     /* Complete cycles of the clock sweep */
    pg_atomic_uint32 numBufferAllocs;   /* Buffers allocated since last reset */

    /*
     * Bgworker process to be notified upon activity or -1 if none. See
     * StrategyNotifyBgWriter.
     */
    int         bgwprocno;
} BufferStrategyControl;
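
/*
 * Illustrative sketch (not part of the upstream file): the freelist is a
 * singly linked list threaded through the buffer descriptors' freeNext
 * fields, with firstFreeBuffer as its head.  Walking it, which must be done
 * while holding buffer_strategy_lock, would look like:
 *
 *    for (int b = StrategyControl->firstFreeBuffer; b >= 0;
 *         b = GetBufferDescriptor(b)->freeNext)
 *        ... inspect buffer b ...
 */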

/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;

/*
 * Private (non-shared) state for managing a ring of shared buffers to re-use.
 * This is currently the only kind of BufferAccessStrategy object, but someday
 * we might have more kinds.
 */
typedef struct BufferAccessStrategyData
{
    /* Overall strategy type */
    BufferAccessStrategyType btype;
    /* Number of elements in buffers[] array */
    int         nbuffers;

    /*
     * Index of the "current" slot in the ring, ie, the one most recently
     * returned by GetBufferFromRing.
     */
    int         current;

    /*
     * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
     * have not yet selected a buffer for this ring slot.  For allocation
     * simplicity this is palloc'd together with the fixed fields of the
     * struct.
     */
    Buffer      buffers[FLEXIBLE_ARRAY_MEMBER];
} BufferAccessStrategyData;


/* Prototypes for internal functions */
static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
                                     uint32 *buf_state);
static void AddBufferToRing(BufferAccessStrategy strategy,
                            BufferDesc *buf);

/*
 * ClockSweepTick - Helper routine for StrategyGetBuffer()
 *
 * Move the clock hand one buffer ahead of its current position and return the
 * id of the buffer now under the hand.
 */
static inline uint32
ClockSweepTick(void)
{
    uint32      victim;

    /*
     * Atomically move the hand ahead one buffer - if there are several
     * processes doing this, this can lead to buffers being returned slightly
     * out of apparent order.
     */
    victim =
        pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);

    if (victim >= NBuffers)
    {
        uint32      originalVictim = victim;

        /* always wrap what we look up in BufferDescriptors */
        victim = victim % NBuffers;

        /*
         * If we're the one that just caused a wraparound, force
         * completePasses to be incremented while holding the spinlock. We
         * need the spinlock so StrategySyncStart() can return a consistent
         * value consisting of nextVictimBuffer and completePasses.
         */
        if (victim == 0)
        {
            uint32      expected;
            uint32      wrapped;
            bool        success = false;

            expected = originalVictim + 1;

            while (!success)
            {
                /*
                 * Acquire the spinlock while increasing completePasses. That
                 * allows other readers to read nextVictimBuffer and
                 * completePasses in a consistent manner, which is required
                 * for StrategySyncStart().  In theory delaying the increment
                 * could lead to an overflow of nextVictimBuffer, but that's
                 * highly unlikely and wouldn't be particularly harmful.
                 */
                SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

                wrapped = expected % NBuffers;

                success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
                                                         &expected, wrapped);
                if (success)
                    StrategyControl->completePasses++;
                SpinLockRelease(&StrategyControl->buffer_strategy_lock);
            }
        }
    }
    return victim;
}
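
/*
 * Illustrative sketch (not part of the upstream file): because the clock
 * hand is a monotonically increasing counter, any raw counter value can be
 * decomposed into a pass count and a position within the buffer array.  The
 * hypothetical helper below only demonstrates the arithmetic that
 * ClockSweepTick() and StrategySyncStart() rely on.
 */
#ifdef NOT_USED
static void
clock_sweep_position(uint32 counter, uint32 *passes, uint32 *hand)
{
    *passes = counter / NBuffers;   /* complete sweeps implied by counter */
    *hand = counter % NBuffers;     /* buffer id currently under the hand */
}
#endif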

/*
 * have_free_buffer -- a lockless check to see if there is a free buffer in
 *                     the buffer pool.
 *
 * A true result can become stale as soon as other operations take buffers
 * off the freelist, so callers that strictly require a free buffer should
 * not rely on this check.
 */
bool
have_free_buffer(void)
{
    if (StrategyControl->firstFreeBuffer >= 0)
        return true;
    else
        return false;
}
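
/*
 * Hypothetical caller sketch (not part of the upstream file): a background
 * worker that only wants to do work while free buffers remain can poll the
 * lockless check and tolerate an occasionally stale answer, e.g.:
 *
 *    while (have_free_buffer())
 *        prewarm_next_block();
 *
 * A rare extra iteration after the freelist empties is harmless.
 * prewarm_next_block() is an assumed helper, named here for illustration
 * only.
 */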

/*
 * StrategyGetBuffer
 *
 *    Called by the bufmgr to get the next candidate buffer to use in
 *    BufferAlloc(). The only hard requirement BufferAlloc() has is that
 *    the selected buffer must not currently be pinned by anyone.
 *
 *    strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *    To ensure that no one else can pin the buffer before we do, we must
 *    return the buffer with the buffer header spinlock still held.
 */
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
{
    BufferDesc *buf;
    int         bgwprocno;
    int         trycounter;
    uint32      local_buf_state;    /* to avoid repeated (de-)referencing */

    *from_ring = false;

    /*
     * If given a strategy object, see whether it can select a buffer. We
     * assume strategy objects don't need buffer_strategy_lock.
     */
    if (strategy != NULL)
    {
        buf = GetBufferFromRing(strategy, buf_state);
        if (buf != NULL)
        {
            *from_ring = true;
            return buf;
        }
    }

    /*
     * If asked, we need to wake the bgwriter. Since we don't want to rely on
     * a spinlock for this we force a read from shared memory once, and then
     * set the latch based on that value. We need to go to this length
     * because otherwise bgwprocno might be reset while/after we check, as
     * the compiler might just reread from memory.
     *
     * This can possibly set the latch of the wrong process if the bgwriter
     * dies at the wrong moment. But since PGPROC->procLatch is never
     * deallocated the worst consequence of that is that we set the latch of
     * some arbitrary process.
     */
    bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
    if (bgwprocno != -1)
    {
        /* reset bgwprocno first, before setting the latch */
        StrategyControl->bgwprocno = -1;

        /*
         * Not acquiring ProcArrayLock here which is slightly icky. It's
         * actually fine because procLatch isn't ever freed, so we just can
         * potentially set the wrong process' (or no process') latch.
         */
        SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
    }

    /*
     * We count buffer allocation requests so that the bgwriter can estimate
     * the rate of buffer consumption.  Note that buffers recycled by a
     * strategy object are intentionally not counted here.
     */
    pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);

    /*
     * First check, without acquiring the lock, whether there are buffers in
     * the freelist. Since we otherwise don't require the spinlock in every
     * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
     * uselessly in most cases. That obviously leaves a race where a buffer
     * is put on the freelist but we don't see the store yet - but that's
     * pretty harmless, it'll just get used during the next buffer
     * acquisition.
     *
     * If there are buffers on the freelist, acquire the spinlock to pop one
     * buffer off the freelist. Then check whether that buffer is usable and
     * repeat if not.
     *
     * Note that the freeNext fields are considered to be protected by the
     * buffer_strategy_lock, not the individual buffer spinlocks, so it's OK
     * to manipulate them without holding the buffer header spinlock.
     */
    if (StrategyControl->firstFreeBuffer >= 0)
    {
        while (true)
        {
            /* Acquire the spinlock to remove element from the freelist */
            SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

            if (StrategyControl->firstFreeBuffer < 0)
            {
                SpinLockRelease(&StrategyControl->buffer_strategy_lock);
                break;
            }

            buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
            Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

            /* Unconditionally remove buffer from freelist */
            StrategyControl->firstFreeBuffer = buf->freeNext;
            buf->freeNext = FREENEXT_NOT_IN_LIST;

            /*
             * Release the lock so someone else can access the freelist while
             * we check out this buffer.
             */
            SpinLockRelease(&StrategyControl->buffer_strategy_lock);

            /*
             * If the buffer is pinned or has a nonzero usage_count, we cannot
             * use it; discard it and retry.  (This can only happen if VACUUM
             * put a valid buffer in the freelist and then someone else used
             * it before we got to it.  It's probably impossible altogether as
             * of 8.3, but we'd better check anyway.)
             */
            local_buf_state = LockBufHdr(buf);
            if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
                && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
            {
                if (strategy != NULL)
                    AddBufferToRing(strategy, buf);
                *buf_state = local_buf_state;
                return buf;
            }
            UnlockBufHdr(buf, local_buf_state);
        }
    }

    /* Nothing on the freelist, so run the "clock sweep" algorithm */
    trycounter = NBuffers;
    for (;;)
    {
        buf = GetBufferDescriptor(ClockSweepTick());

        /*
         * If the buffer is pinned or has a nonzero usage_count, we cannot use
         * it; decrement the usage_count (unless pinned) and keep scanning.
         */
        local_buf_state = LockBufHdr(buf);

        if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
        {
            if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
            {
                local_buf_state -= BUF_USAGECOUNT_ONE;

                trycounter = NBuffers;
            }
            else
            {
                /* Found a usable buffer */
                if (strategy != NULL)
                    AddBufferToRing(strategy, buf);
                *buf_state = local_buf_state;
                return buf;
            }
        }
        else if (--trycounter == 0)
        {
            /*
             * We've scanned all the buffers without making any state
             * changes, so all the buffers are pinned (or were when we looked
             * at them).  We could hope that someone will free one
             * eventually, but it's probably better to fail than to risk
             * getting stuck in an infinite loop.
             */
            UnlockBufHdr(buf, local_buf_state);
            elog(ERROR, "no unpinned buffers available");
        }
        UnlockBufHdr(buf, local_buf_state);
    }
}
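
/*
 * Hypothetical caller sketch (not part of the upstream file): the victim
 * buffer comes back with its header spinlock held, so the caller must claim
 * it or release the lock promptly.  A minimal outline, assuming the caller
 * handles eviction elsewhere:
 */
#ifdef NOT_USED
static BufferDesc *
get_victim_example(BufferAccessStrategy strategy)
{
    uint32      buf_state;
    bool        from_ring;
    BufferDesc *buf;

    buf = StrategyGetBuffer(strategy, &buf_state, &from_ring);

    /* ... inspect buf_state and decide whether to evict or flush ... */

    /* release the header spinlock acquired on our behalf */
    UnlockBufHdr(buf, buf_state);
    return buf;
}
#endif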

/*
 * StrategyFreeBuffer: put a buffer on the freelist
 */
void
StrategyFreeBuffer(BufferDesc *buf)
{
    SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

    /*
     * It is possible that we are told to put something in the freelist that
     * is already in it; don't screw up the list if so.
     */
    if (buf->freeNext == FREENEXT_NOT_IN_LIST)
    {
        buf->freeNext = StrategyControl->firstFreeBuffer;
        if (buf->freeNext < 0)
            StrategyControl->lastFreeBuffer = buf->buf_id;
        StrategyControl->firstFreeBuffer = buf->buf_id;
    }

    SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}

/*
 * StrategySyncStart -- tell BufferSync where to start syncing
 *
 * The result is the buffer index of the best buffer to sync first.
 * BufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
 * the higher-order bits of nextVictimBuffer) and the count of recent buffer
 * allocs if non-NULL pointers are passed.  The alloc count is reset after
 * being read.
 */
int
StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
    uint32      nextVictimBuffer;
    int         result;

    SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
    nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
    result = nextVictimBuffer % NBuffers;

    if (complete_passes)
    {
        *complete_passes = StrategyControl->completePasses;

        /*
         * Additionally add the number of wraparounds that happened before
         * completePasses could be incremented.  Cf. ClockSweepTick().
         */
        *complete_passes += nextVictimBuffer / NBuffers;
    }

    if (num_buf_alloc)
    {
        *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
    }
    SpinLockRelease(&StrategyControl->buffer_strategy_lock);
    return result;
}
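
/*
 * Illustrative sketch (not part of the upstream file): a consumer such as
 * the bgwriter can linearize the (passes, index) pair onto a single
 * unbounded number line, which makes it easy to measure how far the clock
 * hand has moved between two calls.  The helper is hypothetical:
 */
#ifdef NOT_USED
static uint64
sync_start_linear_position(void)
{
    uint32      passes;
    uint32      allocs;
    int         buf_id;

    buf_id = StrategySyncStart(&passes, &allocs);

    /* total buffers the clock hand has swept since startup */
    return (uint64) passes * NBuffers + buf_id;
}
#endif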

/*
 * StrategyNotifyBgWriter -- set or clear allocation notification latch
 *
 * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
 * set that latch.  Pass -1 to clear the pending notification before it
 * happens.  This feature is used by the bgwriter process to wake itself up
 * from hibernation, and is not meant for anybody else to use.
 */
void
StrategyNotifyBgWriter(int bgwprocno)
{
    /*
     * We acquire buffer_strategy_lock just to ensure that the store appears
     * atomic to StrategyGetBuffer.  The bgwriter should call this rather
     * infrequently, so there's no performance penalty from being safe.
     */
    SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
    StrategyControl->bgwprocno = bgwprocno;
    SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}


/*
 * StrategyShmemSize
 *
 * Estimate the size of shared memory used by the freelist-related structures.
 *
 * Note: for somewhat historical reasons, the buffer lookup hashtable size
 * is also determined here.
 */
Size
StrategyShmemSize(void)
{
    Size        size = 0;

    /* size of lookup hash table ... see comment in StrategyInitialize */
    size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));

    /* size of the shared replacement strategy control block */
    size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));

    return size;
}

/*
 * StrategyInitialize -- initialize the buffer cache replacement
 *		strategy.
 *
 * Assumes: All of the buffers are already built into a linked list.
 *		Only called by postmaster and only during initialization.
 */
void
StrategyInitialize(bool init)
{
    bool        found;

    /*
     * Initialize the shared buffer lookup hashtable.
     *
     * Since we can't tolerate running out of lookup table entries, we must
     * be sure to specify an adequate table size here.  The maximum
     * steady-state usage is of course NBuffers entries, but BufferAlloc()
     * tries to insert a new entry before deleting the old.  In principle
     * this could be happening in each partition concurrently, so we could
     * need as many as NBuffers + NUM_BUFFER_PARTITIONS entries.
     */
    InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);

    /*
     * Get or create the shared strategy control block
     */
    StrategyControl = (BufferStrategyControl *)
        ShmemInitStruct("Buffer Strategy Status",
                        sizeof(BufferStrategyControl),
                        &found);

    if (!found)
    {
        /*
         * Only done once, usually in postmaster
         */
        Assert(init);

        SpinLockInit(&StrategyControl->buffer_strategy_lock);

        /*
         * Grab the whole linked list of free buffers for our strategy. We
         * assume it was previously set up by InitBufferPool().
         */
        StrategyControl->firstFreeBuffer = 0;
        StrategyControl->lastFreeBuffer = NBuffers - 1;

        /* Initialize the clock sweep pointer */
        pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);

        /* Clear statistics */
        StrategyControl->completePasses = 0;
        pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);

        /* No pending notification */
        StrategyControl->bgwprocno = -1;
    }
    else
        Assert(!init);
}

/* ----------------------------------------------------------------
 *				Backend-private buffer ring management
 * ----------------------------------------------------------------
 */

/*
 * GetAccessStrategy -- create a BufferAccessStrategy object
 *
 * The object is allocated in the current memory context.
 */
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
    int         ring_size_kb;

    /*
     * Select ring size to use.  See buffer/README for rationales.
     *
     * Note: if you change the ring size for BAS_BULKREAD, see also
     * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
     */
    switch (btype)
    {
        case BAS_NORMAL:
            /* if someone asks for NORMAL, just give 'em a "default" object */
            return NULL;

        case BAS_BULKREAD:
            ring_size_kb = 256;
            break;
        case BAS_BULKWRITE:
            ring_size_kb = 16 * 1024;
            break;
        case BAS_VACUUM:
            ring_size_kb = 2048;
            break;

        default:
            elog(ERROR, "unrecognized buffer access strategy: %d",
                 (int) btype);
            return NULL;        /* keep compiler quiet */
    }

    return GetAccessStrategyWithSize(btype, ring_size_kb);
}

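/*
 * Hypothetical usage sketch (not part of the upstream file): a bulk
 * operation typically creates a strategy once, reads through it, and frees
 * it when done, e.g.:
 *
 *    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
 *
 *    ... ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, strategy)
 *        for each block of interest ...
 *
 *    FreeAccessStrategy(strategy);
 */
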
/*
 * GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a
 *		number of buffers equivalent to the passed in size.
 *
 * If the given ring size is 0, no BufferAccessStrategy will be created and
 * the function will return NULL.  ring_size_kb must not be negative.
 */
BufferAccessStrategy
GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb)
{
    int         ring_buffers;
    BufferAccessStrategy strategy;

    Assert(ring_size_kb >= 0);

    /* Figure out how many buffers ring_size_kb is */
    ring_buffers = ring_size_kb / (BLCKSZ / 1024);

    /* 0 means unlimited, so no BufferAccessStrategy required */
    if (ring_buffers == 0)
        return NULL;

    /* Cap to 1/8th of shared_buffers */
    ring_buffers = Min(NBuffers / 8, ring_buffers);

    /* NBuffers should never be less than 16, so this shouldn't happen */
    Assert(ring_buffers > 0);

    /* Allocate the object and initialize all elements to zeroes */
    strategy = (BufferAccessStrategy)
        palloc0(offsetof(BufferAccessStrategyData, buffers) +
                ring_buffers * sizeof(Buffer));

    /* Set fields that don't start out zero */
    strategy->btype = btype;
    strategy->nbuffers = ring_buffers;

    return strategy;
}

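/*
 * Worked example (not part of the upstream file): with the default 8 kB
 * BLCKSZ, a BAS_BULKREAD ring of 256 kB yields 256 / 8 = 32 buffers, and a
 * BAS_BULKWRITE ring of 16 MB yields 16384 / 8 = 2048 buffers - both
 * subject to the NBuffers / 8 cap applied above.
 */
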
/*
 * GetAccessStrategyBufferCount -- an accessor for the number of buffers in
 *		the ring
 *
 * Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize()
 * returning NULL with 0 size.
 */
int
GetAccessStrategyBufferCount(BufferAccessStrategy strategy)
{
    if (strategy == NULL)
        return 0;

    return strategy->nbuffers;
}

/*
 * GetAccessStrategyPinLimit -- get cap of number of buffers that should be pinned
 *
 * When pinning extra buffers to look ahead, users of a ring-based strategy are
 * in danger of pinning too much of the ring at once while performing look-ahead.
 * For some strategies, that means "escaping" from the ring, and in others it
 * means forcing dirty data to disk very frequently with associated WAL
 * flushing.  Since external code has no insight into any of that, allow
 * individual strategy types to expose a clamp that should be applied when
 * deciding on a maximum number of buffers to pin at once.
 *
 * Callers should combine this number with other relevant limits and take the
 * minimum.
 */
int
GetAccessStrategyPinLimit(BufferAccessStrategy strategy)
{
    if (strategy == NULL)
        return NBuffers;

    switch (strategy->btype)
    {
        case BAS_BULKREAD:

            /*
             * Since BAS_BULKREAD uses StrategyRejectBuffer(), dirty buffers
             * shouldn't be a problem and the caller is free to pin up to the
             * entire ring at once.
             */
            return strategy->nbuffers;

        default:

            /*
             * Tell caller not to pin more than half the buffers in the ring.
             * This is a trade-off between look ahead distance and deferring
             * writeback and associated WAL traffic.
             */
            return strategy->nbuffers / 2;
    }
}

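/*
 * Hypothetical usage sketch (not part of the upstream file): as the comment
 * above says, callers are expected to combine this clamp with their other
 * limits and take the minimum, e.g.:
 *
 *    max_pin = Min(io_concurrency_limit,
 *                  GetAccessStrategyPinLimit(strategy));
 *
 * io_concurrency_limit is an assumed caller-side variable, named only for
 * illustration.
 */
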
/*
 * FreeAccessStrategy -- release a BufferAccessStrategy object
 *
 * A simple pfree would do at the moment, but we would prefer that callers
 * don't assume that much about the representation of BufferAccessStrategy.
 */
void
FreeAccessStrategy(BufferAccessStrategy strategy)
{
    /* don't crash if called on a "default" strategy */
    if (strategy != NULL)
        pfree(strategy);
}

/*
 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
 *		ring is empty / not usable.
 *
 * The buffer header spinlock is held on the returned buffer.
 */
static BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
{
    BufferDesc *buf;
    Buffer      bufnum;
    uint32      local_buf_state;    /* to avoid repeated (de-)referencing */


    /* Advance to next ring slot */
    if (++strategy->current >= strategy->nbuffers)
        strategy->current = 0;

    /*
     * If the slot hasn't been filled yet, tell the caller to allocate a new
     * buffer with the normal allocation strategy.  The caller will then fill
     * this slot by calling AddBufferToRing with the new buffer.
     */
    bufnum = strategy->buffers[strategy->current];
    if (bufnum == InvalidBuffer)
        return NULL;

    /*
     * If the buffer is pinned we cannot use it under any circumstances.
     *
     * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
     * since our own previous usage of the ring element would have left it
     * there, but it might've been decremented by clock sweep since then). A
     * higher usage_count indicates someone else has touched the buffer, so we
     * shouldn't re-use it.
     */
    buf = GetBufferDescriptor(bufnum - 1);
    local_buf_state = LockBufHdr(buf);
    if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
        && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
    {
        *buf_state = local_buf_state;
        return buf;
    }
    UnlockBufHdr(buf, local_buf_state);

    /*
     * Tell the caller to allocate a new buffer with the normal allocation
     * strategy; it will then replace this ring element via AddBufferToRing.
     */
    return NULL;
}

/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
{
    strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}

/*
 * Utility function returning the IOContext of a given BufferAccessStrategy's
 * strategy ring.
 */
IOContext
IOContextForStrategy(BufferAccessStrategy strategy)
{
    if (!strategy)
        return IOCONTEXT_NORMAL;

    switch (strategy->btype)
    {
        case BAS_NORMAL:

            /*
             * Currently, GetAccessStrategy() returns NULL for
             * BufferAccessStrategyType BAS_NORMAL, so this case is
             * unreachable.
             */
            pg_unreachable();
            return IOCONTEXT_NORMAL;
        case BAS_BULKREAD:
            return IOCONTEXT_BULKREAD;
        case BAS_BULKWRITE:
            return IOCONTEXT_BULKWRITE;
        case BAS_VACUUM:
            return IOCONTEXT_VACUUM;
    }

    elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
    pg_unreachable();
}

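/*
 * Hypothetical usage sketch (not part of the upstream file): the buffer
 * manager is expected to resolve the IOContext once per operation and pass
 * it to the I/O statistics machinery, along the lines of:
 *
 *    IOContext   io_context = IOContextForStrategy(strategy);
 *
 *    ... count the read/write/eviction against io_context ...
 */
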
/*
 * StrategyRejectBuffer -- consider rejecting a dirty buffer
 *
 * When a nondefault strategy is used, the buffer manager calls this function
 * when it turns out that the buffer selected by StrategyGetBuffer needs to
 * be written out and doing so would require flushing WAL too.  This gives us
 * a chance to choose a different victim.
 *
 * Returns true if buffer manager should ask for a new victim, and false
 * if this buffer should be written and re-used.
 */
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
{
    /* We only do this in bulkread mode */
    if (strategy->btype != BAS_BULKREAD)
        return false;

    /* Don't muck with behavior of normal buffer-replacement strategy */
    if (!from_ring ||
        strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
        return false;

    /*
     * Remove the dirty buffer from the ring; necessary to prevent an
     * infinite loop if all ring members are dirty.
     */
    strategy->buffers[strategy->current] = InvalidBuffer;

    return true;
}
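
/*
 * Hypothetical caller sketch (not part of the upstream file): the buffer
 * manager's eviction path is expected to consult StrategyRejectBuffer()
 * when flushing the chosen victim would also require flushing WAL, along
 * the lines of:
 *
 *    if (strategy != NULL &&
 *        StrategyRejectBuffer(strategy, buf, from_ring))
 *    {
 *        ... reject this victim; loop back to StrategyGetBuffer() ...
 *    }
 *    else
 *    {
 *        ... flush WAL if needed, write the buffer out, then reuse it ...
 *    }
 */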
|