Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * bufmgr.c
4 : : * buffer manager interface routines
5 : : *
6 : : * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/storage/buffer/bufmgr.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : /*
16 : : * Principal entry points:
17 : : *
18 : : * ReadBuffer() -- find or create a buffer holding the requested page,
19 : : * and pin it so that no one can destroy it while this process
20 : : * is using it.
21 : : *
22 : : * StartReadBuffer() -- as above, with separate wait step
23 : : * StartReadBuffers() -- multiple block version
24 : : * WaitReadBuffers() -- second step of above
25 : : *
26 : : * ReleaseBuffer() -- unpin a buffer
27 : : *
28 : : * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
29 : : * The disk write is delayed until buffer replacement or checkpoint.
30 : : *
31 : : * See also these files:
32 : : * freelist.c -- chooses victim for buffer replacement
33 : : * buf_table.c -- manages the buffer lookup table
34 : : */
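
/*
 * A minimal caller sketch for the entry points above (illustrative only; it
 * assumes a valid Relation "rel", an existing block number "blkno", and that
 * WAL logging and critical sections are handled by the caller as usual):
 *
 *		Buffer	buf = ReadBuffer(rel, blkno);	(find or create + pin)
 *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);	(content lock for modification)
 *		... modify the page via BufferGetPage(buf) ...
 *		MarkBufferDirty(buf);					(the disk write is deferred)
 *		UnlockReleaseBuffer(buf);				(drop content lock and pin)
 */
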
35 : : #include "postgres.h"
36 : :
37 : : #include <sys/file.h>
38 : : #include <unistd.h>
39 : :
40 : : #include "access/tableam.h"
41 : : #include "access/xloginsert.h"
42 : : #include "access/xlogutils.h"
43 : : #include "catalog/storage.h"
44 : : #include "catalog/storage_xlog.h"
45 : : #include "executor/instrument.h"
46 : : #include "lib/binaryheap.h"
47 : : #include "miscadmin.h"
48 : : #include "pg_trace.h"
49 : : #include "pgstat.h"
50 : : #include "postmaster/bgwriter.h"
51 : : #include "storage/buf_internals.h"
52 : : #include "storage/bufmgr.h"
53 : : #include "storage/fd.h"
54 : : #include "storage/ipc.h"
55 : : #include "storage/lmgr.h"
56 : : #include "storage/proc.h"
57 : : #include "storage/smgr.h"
58 : : #include "storage/standby.h"
59 : : #include "utils/memdebug.h"
60 : : #include "utils/ps_status.h"
61 : : #include "utils/rel.h"
62 : : #include "utils/resowner.h"
63 : : #include "utils/timestamp.h"
64 : :
65 : :
66 : : /* Note: these two macros only work on shared buffers, not local ones! */
67 : : #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
68 : : #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
69 : :
70 : : /* Note: this macro only works on local buffers, not shared ones! */
71 : : #define LocalBufHdrGetBlock(bufHdr) \
72 : : LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
73 : :
74 : : /* Bits in SyncOneBuffer's return value */
75 : : #define BUF_WRITTEN 0x01
76 : : #define BUF_REUSABLE 0x02
77 : :
78 : : #define RELS_BSEARCH_THRESHOLD 20
79 : :
80 : : /*
81 : : * This is the size (in blocks) above which we scan the entire buffer
82 : : * pool to remove the buffers for all the pages of the relation being
83 : : * dropped. For relations below this threshold, we instead find the
84 : : * buffers by doing lookups in the BufMapping table.
85 : : */
86 : : #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
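
/*
 * A rough worked example of the threshold above (illustrative; it assumes
 * the default 8kB BLCKSZ and shared_buffers = 128MB, i.e. NBuffers = 16384):
 * the threshold is 16384 / 32 = 512 blocks, so dropping a relation larger
 * than about 4MB triggers a full buffer pool scan, while smaller relations
 * are handled with per-block BufMapping lookups.
 */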
87 : :
88 : : typedef struct PrivateRefCountEntry
89 : : {
90 : : Buffer buffer;
91 : : int32 refcount;
92 : : } PrivateRefCountEntry;
93 : :
94 : : /* 64 bytes, about the size of a cache line on common systems */
95 : : #define REFCOUNT_ARRAY_ENTRIES 8
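
/*
 * A small sketch of the arithmetic behind the comment above (assuming the
 * usual 4-byte Buffer and int32 layout with no padding): each
 * PrivateRefCountEntry is 8 bytes, so 8 entries span 8 * 8 = 64 bytes.  A
 * compile-time check could look like:
 *
 *		StaticAssertDecl(sizeof(PrivateRefCountEntry) * REFCOUNT_ARRAY_ENTRIES == 64,
 *						 "refcount array no longer fits in one cache line");
 */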
96 : :
97 : : /*
98 : : * Status of buffers to checkpoint for a particular tablespace, used
99 : : * internally in BufferSync.
100 : : */
101 : : typedef struct CkptTsStatus
102 : : {
103 : : /* oid of the tablespace */
104 : : Oid tsId;
105 : :
106 : : /*
107 : : * Checkpoint progress for this tablespace. To make progress comparable
108 : : * between tablespaces the progress is, for each tablespace, measured as a
109 : : * number between 0 and the total number of to-be-checkpointed pages. Each
110 : : * page checkpointed in this tablespace increments this space's progress
111 : : * by progress_slice.
112 : : */
113 : : float8 progress;
114 : : float8 progress_slice;
115 : :
116 : : /* number of to-be-checkpointed pages in this tablespace */
117 : : int num_to_scan;
118 : : /* already processed pages in this tablespace */
119 : : int num_scanned;
120 : :
121 : : /* current offset in CkptBufferIds for this tablespace */
122 : : int index;
123 : : } CkptTsStatus;
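
/*
 * A minimal sketch of how the slice described above can be derived
 * (hypothetical helper, shown only to illustrate the arithmetic): if every
 * tablespace's progress runs from 0 to the grand total of to-be-checkpointed
 * pages, each of this tablespace's num_to_scan pages must contribute
 * total / num_to_scan, so all tablespaces advance at the same relative rate.
 */
static inline void
CkptTsStatusExampleInit(CkptTsStatus *st, int total_to_scan)
{
	Assert(st->num_to_scan > 0);
	st->progress = 0.0;
	st->num_scanned = 0;
	st->progress_slice = (float8) total_to_scan / st->num_to_scan;
}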
124 : :
125 : : /*
126 : : * Type for array used to sort SMgrRelations
127 : : *
128 : : * FlushRelationsAllBuffers shares the same comparator function with
129 : : * DropRelationsAllBuffers. A pointer to this struct must therefore be usable
130 : : * as a pointer to RelFileLocator, i.e. rlocator must be the first member.
131 : : */
132 : : typedef struct SMgrSortArray
133 : : {
134 : : RelFileLocator rlocator; /* This must be the first member */
135 : : SMgrRelation srel;
136 : : } SMgrSortArray;
137 : :
138 : : /* GUC variables */
139 : : bool zero_damaged_pages = false;
140 : : int bgwriter_lru_maxpages = 100;
141 : : double bgwriter_lru_multiplier = 2.0;
142 : : bool track_io_timing = false;
143 : :
144 : : /*
145 : : * How many buffers PrefetchBuffer callers should try to stay ahead of their
146 : : * ReadBuffer calls by. Zero means "never prefetch". This value is only used
147 : : * for buffers not belonging to tablespaces that have their
148 : : * effective_io_concurrency parameter set.
149 : : */
150 : : int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
151 : :
152 : : /*
153 : : * Like effective_io_concurrency, but used by maintenance code paths that might
154 : : * benefit from a higher setting because they work on behalf of many sessions.
155 : : * Overridden by the tablespace setting of the same name.
156 : : */
157 : : int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
158 : :
159 : : /*
160 : : * Limit on how many blocks should be handled in single I/O operations.
161 : : * StartReadBuffers() callers should respect it, as should other operations
162 : : * that call smgr APIs directly.
163 : : */
164 : : int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
165 : :
166 : : /*
167 : : * GUC variables about triggering kernel writeback for buffers written; OS
168 : : * dependent defaults are set via the GUC mechanism.
169 : : */
170 : : int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
171 : : int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
172 : : int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
173 : :
174 : : /* local state for LockBufferForCleanup */
175 : : static BufferDesc *PinCountWaitBuf = NULL;
176 : :
177 : : /*
178 : : * Backend-Private refcount management:
179 : : *
180 : : * Each buffer also has a private refcount that keeps track of the number of
181 : : * times the buffer is pinned in the current process. This is so that the
182 : : * shared refcount needs to be modified only once if a buffer is pinned more
183 : : * than once by an individual backend. It's also used to check that no buffers
184 : : * are still pinned at the end of transactions and when exiting.
185 : : *
186 : : *
187 : : * To avoid - as we used to - requiring an array with NBuffers entries to keep
188 : : * track of local buffers, we use a small sequentially searched array
189 : : * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
190 : : * keep track of backend local pins.
191 : : *
192 : : * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once,
193 : : * all refcounts are tracked in the array; after that, new array entries
194 : : * displace old ones into the hash table. That way a frequently used entry
195 : : * can't get "stuck" in the hashtable while infrequent ones clog the array.
196 : : *
197 : : * Note that in most scenarios the number of pinned buffers will not exceed
198 : : * REFCOUNT_ARRAY_ENTRIES.
199 : : *
200 : : *
201 : : * To enter a buffer into the refcount tracking mechanism, first reserve a free
202 : : * entry using ReservePrivateRefCountEntry() and then later, if necessary,
203 : : * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
204 : : * memory allocations in NewPrivateRefCountEntry(), which can be important
205 : : * because in some scenarios it's called with a spinlock held...
206 : : */
207 : : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
208 : : static HTAB *PrivateRefCountHash = NULL;
209 : : static int32 PrivateRefCountOverflowed = 0;
210 : : static uint32 PrivateRefCountClock = 0;
211 : : static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
212 : :
213 : : static void ReservePrivateRefCountEntry(void);
214 : : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
215 : : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
216 : : static inline int32 GetPrivateRefCount(Buffer buffer);
217 : : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
218 : :
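/*
 * A minimal sketch of the reserve-then-fill protocol described above
 * (hypothetical function, for illustration only): the reservation is made
 * while no spinlock is held, so that NewPrivateRefCountEntry() never has to
 * allocate memory at a point where that would be unsafe.
 */
static inline void
PrivateRefCountExamplePin(Buffer buffer)
{
	PrivateRefCountEntry *ref;

	/* may push an old array entry into the overflow hash table */
	ReservePrivateRefCountEntry();

	ref = GetPrivateRefCountEntry(buffer, true);
	if (ref == NULL)
		ref = NewPrivateRefCountEntry(buffer);	/* consumes the reservation */
	ref->refcount++;
}
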
219 : : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
220 : : static void ResOwnerReleaseBufferIO(Datum res);
221 : : static char *ResOwnerPrintBufferIO(Datum res);
222 : : static void ResOwnerReleaseBufferPin(Datum res);
223 : : static char *ResOwnerPrintBufferPin(Datum res);
224 : :
225 : : const ResourceOwnerDesc buffer_io_resowner_desc =
226 : : {
227 : : .name = "buffer io",
228 : : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
229 : : .release_priority = RELEASE_PRIO_BUFFER_IOS,
230 : : .ReleaseResource = ResOwnerReleaseBufferIO,
231 : : .DebugPrint = ResOwnerPrintBufferIO
232 : : };
233 : :
234 : : const ResourceOwnerDesc buffer_pin_resowner_desc =
235 : : {
236 : : .name = "buffer pin",
237 : : .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
238 : : .release_priority = RELEASE_PRIO_BUFFER_PINS,
239 : : .ReleaseResource = ResOwnerReleaseBufferPin,
240 : : .DebugPrint = ResOwnerPrintBufferPin
241 : : };
242 : :
243 : : /*
244 : : * Ensure that the PrivateRefCountArray has sufficient space to store one more
245 : : * entry. This has to be called before using NewPrivateRefCountEntry() to fill
246 : : * a new entry - but it's perfectly fine to not use a reserved entry.
247 : : */
248 : : static void
3373 andres@anarazel.de 249 :CBC 54826905 : ReservePrivateRefCountEntry(void)
250 : : {
251 : : /* Already reserved (or freed), nothing to do */
252 [ + + ]: 54826905 : if (ReservedRefCountEntry != NULL)
253 : 51196689 : return;
254 : :
255 : : /*
256 : : * First search for a free entry in the array; that'll be sufficient in the
257 : : * majority of cases.
258 : : */
259 : : {
260 : : int i;
261 : :
262 [ + + ]: 8711428 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
263 : : {
264 : : PrivateRefCountEntry *res;
265 : :
266 : 8627384 : res = &PrivateRefCountArray[i];
267 : :
268 [ + + ]: 8627384 : if (res->buffer == InvalidBuffer)
269 : : {
270 : 3546172 : ReservedRefCountEntry = res;
271 : 3546172 : return;
272 : : }
273 : : }
274 : : }
275 : :
276 : : /*
277 : : * No luck. All array entries are full. Move one array entry into the hash
278 : : * table.
279 : : */
280 : : {
281 : : /*
282 : : * Move entry from the current clock position in the array into the
283 : : * hashtable. Use that slot.
284 : : */
285 : : PrivateRefCountEntry *hashent;
286 : : bool found;
287 : :
288 : : /* select victim slot */
3249 bruce@momjian.us 289 : 84044 : ReservedRefCountEntry =
3373 andres@anarazel.de 290 : 84044 : &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
291 : :
292 : : /* Better be used, otherwise we shouldn't get here. */
293 [ - + ]: 84044 : Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
294 : :
295 : : /* enter victim array entry into hashtable */
296 : 84044 : hashent = hash_search(PrivateRefCountHash,
433 peter@eisentraut.org 297 : 84044 : &(ReservedRefCountEntry->buffer),
298 : : HASH_ENTER,
299 : : &found);
3373 andres@anarazel.de 300 [ - + ]: 84044 : Assert(!found);
301 : 84044 : hashent->refcount = ReservedRefCountEntry->refcount;
302 : :
303 : : /* clear the now free array slot */
304 : 84044 : ReservedRefCountEntry->buffer = InvalidBuffer;
305 : 84044 : ReservedRefCountEntry->refcount = 0;
306 : :
307 : 84044 : PrivateRefCountOverflowed++;
308 : : }
309 : : }
310 : :
311 : : /*
312 : : * Fill a previously reserved refcount entry.
313 : : */
314 : : static PrivateRefCountEntry *
315 : 49607311 : NewPrivateRefCountEntry(Buffer buffer)
316 : : {
317 : : PrivateRefCountEntry *res;
318 : :
319 : : /* only allowed to be called when a reservation has been made */
320 [ - + ]: 49607311 : Assert(ReservedRefCountEntry != NULL);
321 : :
322 : : /* use up the reserved entry */
323 : 49607311 : res = ReservedRefCountEntry;
324 : 49607311 : ReservedRefCountEntry = NULL;
325 : :
326 : : /* and fill it */
327 : 49607311 : res->buffer = buffer;
328 : 49607311 : res->refcount = 0;
329 : :
330 : 49607311 : return res;
331 : : }
332 : :
333 : : /*
334 : : * Return the PrivateRefCount entry for the passed buffer.
335 : : *
336 : : * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
337 : : * Returns NULL if the buffer doesn't have a refcount entry. Otherwise, if
338 : : * do_move is true and the entry resides in the hashtable, the entry is
339 : : * moved to the array to optimize it for frequent access.
340 : : static PrivateRefCountEntry *
341 : 450035745 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
342 : : {
343 : : PrivateRefCountEntry *res;
344 : : int i;
345 : :
3515 346 [ - + ]: 450035745 : Assert(BufferIsValid(buffer));
347 [ - + ]: 450035745 : Assert(!BufferIsLocal(buffer));
348 : :
349 : : /*
350 : : * First search for references in the array; that'll be sufficient in the
351 : : * majority of cases.
352 : : */
353 [ + + ]: 1274208849 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
354 : : {
355 : 1223566631 : res = &PrivateRefCountArray[i];
356 : :
357 [ + + ]: 1223566631 : if (res->buffer == buffer)
358 : 399393527 : return res;
359 : : }
360 : :
361 : : /*
362 : : * By here we know that the buffer, if already pinned, isn't residing in
363 : : * the array.
364 : : *
365 : : * Only look up the buffer in the hashtable if we've previously overflowed
366 : : * into it.
367 : : */
3373 368 [ + + ]: 50642218 : if (PrivateRefCountOverflowed == 0)
369 : 49880809 : return NULL;
370 : :
433 peter@eisentraut.org 371 : 761409 : res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
372 : :
3373 andres@anarazel.de 373 [ + + ]: 761409 : if (res == NULL)
374 : 245327 : return NULL;
375 [ + + ]: 516082 : else if (!do_move)
376 : : {
377 : : /* caller doesn't want us to move the hash entry into the array */
378 : 508507 : return res;
379 : : }
380 : : else
381 : : {
382 : : /* move buffer from hashtable into the free array slot */
383 : : bool found;
384 : : PrivateRefCountEntry *free;
385 : :
386 : : /* Ensure there's a free array slot */
387 : 7575 : ReservePrivateRefCountEntry();
388 : :
389 : : /* Use up the reserved slot */
390 [ - + ]: 7575 : Assert(ReservedRefCountEntry != NULL);
391 : 7575 : free = ReservedRefCountEntry;
392 : 7575 : ReservedRefCountEntry = NULL;
393 [ - + ]: 7575 : Assert(free->buffer == InvalidBuffer);
394 : :
395 : : /* and fill it */
396 : 7575 : free->buffer = buffer;
397 : 7575 : free->refcount = res->refcount;
398 : :
399 : : /* delete from hashtable */
433 peter@eisentraut.org 400 : 7575 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
3373 andres@anarazel.de 401 [ - + ]: 7575 : Assert(found);
402 [ - + ]: 7575 : Assert(PrivateRefCountOverflowed > 0);
403 : 7575 : PrivateRefCountOverflowed--;
404 : :
405 : 7575 : return free;
406 : : }
407 : : }
408 : :
409 : : /*
410 : : * Returns how many times the passed buffer is pinned by this backend.
411 : : *
412 : : * Only works for shared memory buffers!
413 : : */
414 : : static inline int32
3515 415 : 328390583 : GetPrivateRefCount(Buffer buffer)
416 : : {
417 : : PrivateRefCountEntry *ref;
418 : :
419 [ - + ]: 328390583 : Assert(BufferIsValid(buffer));
420 [ - + ]: 328390583 : Assert(!BufferIsLocal(buffer));
421 : :
422 : : /*
423 : : * Not moving the entry - that's ok for the current users, but we might
424 : : * want to change this one day.
425 : : */
3373 426 : 328390583 : ref = GetPrivateRefCountEntry(buffer, false);
427 : :
3515 428 [ + + ]: 328390583 : if (ref == NULL)
429 : 518825 : return 0;
430 : 327871758 : return ref->refcount;
431 : : }
432 : :
433 : : /*
434 : : * Release resources used to track the reference count of a buffer which we no
435 : : * longer have pinned and don't want to pin again immediately.
436 : : */
437 : : static void
438 : 49607311 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
439 : : {
440 [ - + ]: 49607311 : Assert(ref->refcount == 0);
441 : :
442 [ + - + + ]: 49607311 : if (ref >= &PrivateRefCountArray[0] &&
443 : : ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
444 : : {
445 : 49530842 : ref->buffer = InvalidBuffer;
446 : :
447 : : /*
448 : : * Mark the just used entry as reserved - in many scenarios that
449 : : * allows us to avoid ever having to search the array/hash for free
450 : : * entries.
451 : : */
3373 452 : 49530842 : ReservedRefCountEntry = ref;
453 : : }
454 : : else
455 : : {
456 : : bool found;
3249 bruce@momjian.us 457 : 76469 : Buffer buffer = ref->buffer;
458 : :
433 peter@eisentraut.org 459 : 76469 : hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
3515 andres@anarazel.de 460 [ - + ]: 76469 : Assert(found);
461 [ - + ]: 76469 : Assert(PrivateRefCountOverflowed > 0);
462 : 76469 : PrivateRefCountOverflowed--;
463 : : }
464 : 49607311 : }
465 : :
466 : : /*
467 : : * BufferIsPinned
468 : : * True iff the buffer is pinned (also checks for valid buffer number).
469 : : *
470 : : * NOTE: what we check here is that *this* backend holds a pin on
471 : : * the buffer. We do not care whether some other backend does.
472 : : */
473 : : #define BufferIsPinned(bufnum) \
474 : : ( \
475 : : !BufferIsValid(bufnum) ? \
476 : : false \
477 : : : \
478 : : BufferIsLocal(bufnum) ? \
479 : : (LocalRefCount[-(bufnum) - 1] > 0) \
480 : : : \
481 : : (GetPrivateRefCount(bufnum) > 0) \
482 : : )
483 : :
484 : :
485 : : static Buffer ReadBuffer_common(Relation rel,
486 : : SMgrRelation smgr, char smgr_persistence,
487 : : ForkNumber forkNum, BlockNumber blockNum,
488 : : ReadBufferMode mode, BufferAccessStrategy strategy);
489 : : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
490 : : ForkNumber fork,
491 : : BufferAccessStrategy strategy,
492 : : uint32 flags,
493 : : uint32 extend_by,
494 : : BlockNumber extend_upto,
495 : : Buffer *buffers,
496 : : uint32 *extended_by);
497 : : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
498 : : ForkNumber fork,
499 : : BufferAccessStrategy strategy,
500 : : uint32 flags,
501 : : uint32 extend_by,
502 : : BlockNumber extend_upto,
503 : : Buffer *buffers,
504 : : uint32 *extended_by);
505 : : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
506 : : static void PinBuffer_Locked(BufferDesc *buf);
507 : : static void UnpinBuffer(BufferDesc *buf);
508 : : static void UnpinBufferNoOwner(BufferDesc *buf);
509 : : static void BufferSync(int flags);
510 : : static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
511 : : static int SyncOneBuffer(int buf_id, bool skip_recently_used,
512 : : WritebackContext *wb_context);
513 : : static void WaitIO(BufferDesc *buf);
514 : : static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
515 : : static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
516 : : uint32 set_flag_bits, bool forget_owner);
517 : : static void AbortBufferIO(Buffer buffer);
518 : : static void shared_buffer_write_error_callback(void *arg);
519 : : static void local_buffer_write_error_callback(void *arg);
520 : : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
521 : : char relpersistence,
522 : : ForkNumber forkNum,
523 : : BlockNumber blockNum,
524 : : BufferAccessStrategy strategy,
525 : : bool *foundPtr, IOContext io_context);
526 : : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
527 : : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
528 : : IOObject io_object, IOContext io_context);
529 : : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
530 : : ForkNumber forkNum,
531 : : BlockNumber nForkBlock,
532 : : BlockNumber firstDelBlock);
533 : : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
534 : : RelFileLocator dstlocator,
535 : : ForkNumber forkNum, bool permanent);
536 : : static void AtProcExit_Buffers(int code, Datum arg);
537 : : static void CheckForBufferLeaks(void);
538 : : static int rlocator_comparator(const void *p1, const void *p2);
539 : : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
540 : : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
541 : : static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
542 : :
543 : :
544 : : /*
545 : : * Implementation of PrefetchBuffer() for shared buffers.
546 : : */
547 : : PrefetchBufferResult
1467 tmunro@postgresql.or 548 : 766779 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
549 : : ForkNumber forkNum,
550 : : BlockNumber blockNum)
551 : : {
552 : 766779 : PrefetchBufferResult result = {InvalidBuffer, false};
553 : : BufferTag newTag; /* identity of requested block */
554 : : uint32 newHash; /* hash value for newTag */
555 : : LWLock *newPartitionLock; /* buffer partition lock for it */
556 : : int buf_id;
557 : :
558 [ - + ]: 766779 : Assert(BlockNumberIsValid(blockNum));
559 : :
560 : : /* create a tag so we can lookup the buffer */
627 rhaas@postgresql.org 561 : 766779 : InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
562 : : forkNum, blockNum);
563 : :
564 : : /* determine its hash code and partition lock ID */
1467 tmunro@postgresql.or 565 : 766779 : newHash = BufTableHashCode(&newTag);
566 : 766779 : newPartitionLock = BufMappingPartitionLock(newHash);
567 : :
568 : : /* see if the block is in the buffer pool already */
569 : 766779 : LWLockAcquire(newPartitionLock, LW_SHARED);
570 : 766779 : buf_id = BufTableLookup(&newTag, newHash);
571 : 766779 : LWLockRelease(newPartitionLock);
572 : :
573 : : /* If not in buffers, initiate prefetch */
574 [ + + ]: 766779 : if (buf_id < 0)
575 : : {
576 : : #ifdef USE_PREFETCH
577 : : /*
578 : : * Try to initiate an asynchronous read. This returns false in
579 : : * recovery if the relation file doesn't exist.
580 : : */
372 581 [ + + + - ]: 274567 : if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
120 tmunro@postgresql.or 582 :GNC 137173 : smgrprefetch(smgr_reln, forkNum, blockNum, 1))
583 : : {
1467 tmunro@postgresql.or 584 :CBC 137173 : result.initiated_io = true;
585 : : }
586 : : #endif /* USE_PREFETCH */
587 : : }
588 : : else
589 : : {
590 : : /*
591 : : * Report the buffer it was in at that time. The caller may be able
592 : : * to avoid a buffer table lookup, but it's not pinned and it must be
593 : : * rechecked!
594 : : */
595 : 629385 : result.recent_buffer = buf_id + 1;
596 : : }
597 : :
598 : : /*
599 : : * If the block *is* in buffers, we do nothing. This is not really ideal:
600 : : * the block might be just about to be evicted, which would be stupid
601 : : * since we know we are going to need it soon. But the only easy answer
602 : : * is to bump the usage_count, which does not seem like a great solution:
603 : : * when the caller does ultimately touch the block, usage_count would get
604 : : * bumped again, resulting in too much favoritism for blocks that are
605 : : * involved in a prefetch sequence. A real fix would involve some
606 : : * additional per-buffer state, and it's not clear that there's enough of
607 : : * a problem to justify that.
608 : : */
609 : :
610 : 766779 : return result;
611 : : }
612 : :
613 : : /*
614 : : * PrefetchBuffer -- initiate asynchronous read of a block of a relation
615 : : *
616 : : * This is named by analogy to ReadBuffer but doesn't actually allocate a
617 : : * buffer. Instead it tries to ensure that a future ReadBuffer for the given
618 : : * block will not be delayed by the I/O. Prefetching is optional.
619 : : *
620 : : * There are three possible outcomes:
621 : : *
622 : : * 1. If the block is already cached, the result includes a valid buffer that
623 : : * could be used by the caller to avoid the need for a later buffer lookup, but
624 : : * it's not pinned, so the caller must recheck it.
625 : : *
626 : : * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
627 : : * true. Currently there is no way to know if the data was already cached by
628 : : * the kernel and therefore didn't really initiate I/O, and no way to know when
629 : : * the I/O completes other than using synchronous ReadBuffer().
630 : : *
631 : : * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
632 : : * USE_PREFETCH is not defined (this build doesn't support prefetching due to
633 : : * lack of a kernel facility), direct I/O is enabled, or the underlying
634 : : * relation file wasn't found and we are in recovery. (If the relation file
635 : : * wasn't found and we are not in recovery, an error is raised).
636 : : */
637 : : PrefetchBufferResult
5571 tgl@sss.pgh.pa.us 638 : 196025 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
639 : : {
640 [ - + ]: 196025 : Assert(RelationIsValid(reln));
641 [ - + ]: 196025 : Assert(BlockNumberIsValid(blockNum));
642 : :
4871 rhaas@postgresql.org 643 [ + + ]: 196025 : if (RelationUsesLocalBuffers(reln))
644 : : {
645 : : /* see comments in ReadBufferExtended */
5493 tgl@sss.pgh.pa.us 646 [ + - - + ]: 3100 : if (RELATION_IS_OTHER_TEMP(reln))
5493 tgl@sss.pgh.pa.us 647 [ # # ]:UBC 0 : ereport(ERROR,
648 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
649 : : errmsg("cannot access temporary tables of other sessions")));
650 : :
651 : : /* pass it off to localbuf.c */
1007 tgl@sss.pgh.pa.us 652 :CBC 3100 : return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
653 : : }
654 : : else
655 : : {
656 : : /* pass it to the shared buffer version */
657 : 192925 : return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
658 : : }
659 : : }
660 : :
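/*
 * A minimal sketch of the intended calling pattern (illustrative only;
 * "rel", "next_blkno" and "distance" are assumed to come from the caller's
 * scan state): issue the prefetch a configurable number of blocks ahead of
 * the ReadBuffer() calls that will eventually need the data.
 */
static inline void
PrefetchAheadExample(Relation rel, BlockNumber next_blkno, int distance)
{
	PrefetchBufferResult prefetch;

	/* ask the kernel to start reading a block we will need soon */
	prefetch = PrefetchBuffer(rel, MAIN_FORKNUM, next_blkno + distance);

	/*
	 * prefetch.initiated_io says whether I/O advice was issued;
	 * prefetch.recent_buffer, if valid, is an unpinned hint that could be
	 * kept and handed to ReadRecentBuffer() later.
	 */
	(void) prefetch;

	/* ... meanwhile, read (and pin) the current block the normal way ... */
	ReleaseBuffer(ReadBuffer(rel, next_blkno));
}
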
661 : : /*
662 : : * ReadRecentBuffer -- try to pin a block in a recently observed buffer
663 : : *
664 : : * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
665 : : * successful. Return true if the buffer is valid and still has the expected
666 : : * tag. In that case, the buffer is pinned and the usage count is bumped.
667 : : */
668 : : bool
648 rhaas@postgresql.org 669 : 518831 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
670 : : Buffer recent_buffer)
671 : : {
672 : : BufferDesc *bufHdr;
673 : : BufferTag tag;
674 : : uint32 buf_state;
675 : : bool have_private_ref;
676 : :
1102 tmunro@postgresql.or 677 [ - + ]: 518831 : Assert(BufferIsValid(recent_buffer));
678 : :
158 heikki.linnakangas@i 679 :GNC 518831 : ResourceOwnerEnlarge(CurrentResourceOwner);
1102 tmunro@postgresql.or 680 :CBC 518831 : ReservePrivateRefCountEntry();
627 rhaas@postgresql.org 681 : 518831 : InitBufferTag(&tag, &rlocator, forkNum, blockNum);
682 : :
1102 tmunro@postgresql.or 683 [ - + ]: 518831 : if (BufferIsLocal(recent_buffer))
684 : : {
629 heikki.linnakangas@i 685 :UBC 0 : int b = -recent_buffer - 1;
686 : :
687 : 0 : bufHdr = GetLocalBufferDescriptor(b);
1102 tmunro@postgresql.or 688 : 0 : buf_state = pg_atomic_read_u32(&bufHdr->state);
689 : :
690 : : /* Is it still valid and holding the right tag? */
627 rhaas@postgresql.org 691 [ # # # # ]: 0 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
692 : : {
375 andres@anarazel.de 693 : 0 : PinLocalBuffer(bufHdr, true);
694 : :
738 tmunro@postgresql.or 695 : 0 : pgBufferUsage.local_blks_hit++;
696 : :
1102 697 : 0 : return true;
698 : : }
699 : : }
700 : : else
701 : : {
1102 tmunro@postgresql.or 702 :CBC 518831 : bufHdr = GetBufferDescriptor(recent_buffer - 1);
703 : 518831 : have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
704 : :
705 : : /*
706 : : * Do we already have this buffer pinned with a private reference? If
707 : : * so, it must be valid and it is safe to check the tag without
708 : : * locking. If not, we have to lock the header first and then check.
709 : : */
710 [ + + ]: 518831 : if (have_private_ref)
711 : 7 : buf_state = pg_atomic_read_u32(&bufHdr->state);
712 : : else
713 : 518824 : buf_state = LockBufHdr(bufHdr);
714 : :
627 rhaas@postgresql.org 715 [ + + + + ]: 518831 : if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
716 : : {
717 : : /*
718 : : * It's now safe to pin the buffer. We can't pin first and ask
719 : : * questions later, because it might confuse code paths like
720 : : * InvalidateBuffer() if we pinned a random non-matching buffer.
721 : : */
1102 tmunro@postgresql.or 722 [ - + ]: 516485 : if (have_private_ref)
1102 tmunro@postgresql.or 723 :UBC 0 : PinBuffer(bufHdr, NULL); /* bump pin count */
724 : : else
1102 tmunro@postgresql.or 725 :CBC 516485 : PinBuffer_Locked(bufHdr); /* pin for first time */
726 : :
738 727 : 516485 : pgBufferUsage.shared_blks_hit++;
728 : :
1102 729 : 516485 : return true;
730 : : }
731 : :
732 : : /* If we locked the header above, now unlock. */
733 [ + + ]: 2346 : if (!have_private_ref)
734 : 2339 : UnlockBufHdr(bufHdr, buf_state);
735 : : }
736 : :
737 : 2346 : return false;
738 : : }
739 : :
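/*
 * A minimal sketch of how the recent_buffer hint is meant to be used
 * (illustrative only; "recent" is assumed to have been remembered from an
 * earlier PrefetchBuffer()/PrefetchSharedBuffer() result for the same
 * block): try the remembered buffer first, and fall back to the normal
 * lookup path when it has been evicted or recycled in the meantime.
 */
static inline Buffer
ReadRecentBufferExample(Relation rel, BlockNumber blkno, Buffer recent)
{
	if (BufferIsValid(recent) &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, recent))
		return recent;			/* pinned, and the tag still matches */

	/* stale (or absent) hint: do the regular buffer mapping lookup */
	return ReadBuffer(rel, blkno);
}
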
740 : : /*
741 : : * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
742 : : * fork with RBM_NORMAL mode and default strategy.
743 : : */
744 : : Buffer
5644 heikki.linnakangas@i 745 : 38016719 : ReadBuffer(Relation reln, BlockNumber blockNum)
746 : : {
747 : 38016719 : return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
748 : : }
749 : :
750 : : /*
751 : : * ReadBufferExtended -- returns a buffer containing the requested
752 : : * block of the requested relation. If the blknum
753 : : * requested is P_NEW, extend the relation file and
754 : : * allocate a new block. (Caller is responsible for
755 : : * ensuring that only one backend tries to extend a
756 : : * relation at the same time!)
757 : : *
758 : : * Returns: the buffer number for the buffer containing
759 : : * the block read. The returned buffer has been pinned.
760 : : * Does not return on error --- elog's instead.
761 : : *
762 : : * Assume that reln has already been opened when this function is called.
763 : : *
764 : : * In RBM_NORMAL mode, the page is read from disk, and the page header is
765 : : * validated. An error is thrown if the page header is not valid. (But
766 : : * note that an all-zero page is considered "valid"; see
767 : : * PageIsVerifiedExtended().)
768 : : *
769 : : * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
770 : : * valid, the page is zeroed instead of throwing an error. This is intended
771 : : * for non-critical data, where the caller is prepared to repair errors.
772 : : *
773 : : * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
774 : : * filled with zeros instead of reading it from disk. Useful when the caller
775 : : * is going to fill the page from scratch, since this saves I/O and avoids
776 : : * unnecessary failure if the page-on-disk has corrupt page headers.
777 : : * The page is returned locked to ensure that the caller has a chance to
778 : : * initialize the page before it's made visible to others.
779 : : * Caution: do not use this mode to read a page that is beyond the relation's
780 : : * current physical EOF; that is likely to cause problems in md.c when
781 : : * the page is modified and written out. P_NEW is OK, though.
782 : : *
783 : : * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
784 : : * a cleanup-strength lock on the page.
785 : : *
786 : : * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
787 : : *
788 : : * If strategy is not NULL, a nondefault buffer access strategy is used.
789 : : * See buffer/README for details.
790 : : */
791 : : inline Buffer
792 : 46293649 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
793 : : ReadBufferMode mode, BufferAccessStrategy strategy)
794 : : {
795 : : Buffer buf;
796 : :
797 : : /*
798 : : * Reject attempts to read non-local temporary relations; we would be
799 : : * likely to get wrong data since we have no visibility into the owning
800 : : * session's local buffers.
801 : : */
5493 tgl@sss.pgh.pa.us 802 [ + + - + ]: 46293649 : if (RELATION_IS_OTHER_TEMP(reln))
5493 tgl@sss.pgh.pa.us 803 [ # # ]:UBC 0 : ereport(ERROR,
804 : : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
805 : : errmsg("cannot access temporary tables of other sessions")));
806 : :
807 : : /*
808 : : * Read the buffer, and update pgstat counters to reflect a cache hit or
809 : : * miss.
810 : : */
11 tmunro@postgresql.or 811 :GNC 46293649 : buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
812 : : forkNum, blockNum, mode, strategy);
813 : :
5785 heikki.linnakangas@i 814 :CBC 46293633 : return buf;
815 : : }
816 : :
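/*
 * A minimal sketch of combining a non-default mode and strategy (illustrative
 * only; "rel" and "blkno" are assumed): a bulk read of non-critical data can
 * use a ring-buffer strategy together with RBM_ZERO_ON_ERROR, so a corrupt
 * page header yields an all-zeroes page instead of an error.
 */
static inline Buffer
ReadBufferZeroOnErrorExample(Relation rel, BlockNumber blkno)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	Buffer		buf;

	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_ON_ERROR,
							 strategy);
	FreeAccessStrategy(strategy);
	return buf;
}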
817 : :
818 : : /*
819 : : * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
820 : : * a relcache entry for the relation.
821 : : *
822 : : * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
823 : : * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
824 : : * cannot be used for temporary relations (and making that work might be
825 : : * difficult, unless we only want to read temporary relations for our own
826 : : * ProcNumber).
827 : : */
828 : : Buffer
648 rhaas@postgresql.org 829 : 3345591 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
830 : : BlockNumber blockNum, ReadBufferMode mode,
831 : : BufferAccessStrategy strategy, bool permanent)
832 : : {
42 heikki.linnakangas@i 833 :GNC 3345591 : SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
834 : :
11 tmunro@postgresql.or 835 [ + - ]: 3345591 : return ReadBuffer_common(NULL, smgr,
836 : : permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
837 : : forkNum, blockNum,
838 : : mode, strategy);
839 : : }
840 : :
841 : : /*
842 : : * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
843 : : */
844 : : Buffer
235 tmunro@postgresql.or 845 :CBC 43892 : ExtendBufferedRel(BufferManagerRelation bmr,
846 : : ForkNumber forkNum,
847 : : BufferAccessStrategy strategy,
848 : : uint32 flags)
849 : : {
850 : : Buffer buf;
375 andres@anarazel.de 851 : 43892 : uint32 extend_by = 1;
852 : :
235 tmunro@postgresql.or 853 : 43892 : ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
854 : : &buf, &extend_by);
855 : :
375 andres@anarazel.de 856 : 43892 : return buf;
857 : : }
858 : :
859 : : /*
860 : : * Extend relation by multiple blocks.
861 : : *
862 : : * Tries to extend the relation by extend_by blocks. Depending on the
863 : : * availability of resources the relation may end up being extended by a
864 : : * smaller number of pages (unless an error is thrown, always by at least one
865 : : * page). *extended_by is updated to the number of pages the relation has been
866 : : * extended by.
867 : : *
868 : : * buffers needs to be an array that is at least extend_by long. Upon
869 : : * completion, the first extend_by array elements will point to a pinned
870 : : * buffer.
871 : : *
872 : : * If EB_LOCK_FIRST is part of flags, the first returned buffer is
873 : : * locked. This is useful for callers that want a buffer that is guaranteed to
874 : : * be empty.
875 : : */
876 : : BlockNumber
235 tmunro@postgresql.or 877 : 140189 : ExtendBufferedRelBy(BufferManagerRelation bmr,
878 : : ForkNumber fork,
879 : : BufferAccessStrategy strategy,
880 : : uint32 flags,
881 : : uint32 extend_by,
882 : : Buffer *buffers,
883 : : uint32 *extended_by)
884 : : {
885 [ - + ]: 140189 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
886 [ - + - - ]: 140189 : Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
375 andres@anarazel.de 887 [ - + ]: 140189 : Assert(extend_by > 0);
888 : :
235 tmunro@postgresql.or 889 [ + - ]: 140189 : if (bmr.smgr == NULL)
890 : : {
891 : 140189 : bmr.smgr = RelationGetSmgr(bmr.rel);
892 : 140189 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
893 : : }
894 : :
895 : 140189 : return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
896 : : extend_by, InvalidBlockNumber,
897 : : buffers, extended_by);
898 : : }
899 : :
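/*
 * A minimal sketch of extending a relation through this API (illustrative
 * only; "rel" is assumed and WAL logging/critical sections are omitted):
 * request one new block and have it come back pinned and exclusively locked,
 * so it can be initialized before any other backend can see it.
 */
static inline Buffer
ExtendByOneBlockExample(Relation rel)
{
	Buffer		buf;

	buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
	PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);

	return buf;					/* caller marks dirty, unlocks and releases */
}
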
900 : : /*
901 : : * Extend the relation so it is at least extend_to blocks large, return buffer
902 : : * (extend_to - 1).
903 : : *
904 : : * This is useful for callers that want to write a specific page, regardless
905 : : * of the current size of the relation (e.g. useful for visibilitymap and for
906 : : * crash recovery).
907 : : */
908 : : Buffer
909 : 53605 : ExtendBufferedRelTo(BufferManagerRelation bmr,
910 : : ForkNumber fork,
911 : : BufferAccessStrategy strategy,
912 : : uint32 flags,
913 : : BlockNumber extend_to,
914 : : ReadBufferMode mode)
915 : : {
916 : : BlockNumber current_size;
375 andres@anarazel.de 917 : 53605 : uint32 extended_by = 0;
918 : 53605 : Buffer buffer = InvalidBuffer;
919 : : Buffer buffers[64];
920 : :
235 tmunro@postgresql.or 921 [ - + ]: 53605 : Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
922 [ + + - + ]: 53605 : Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
375 andres@anarazel.de 923 [ + - - + ]: 53605 : Assert(extend_to != InvalidBlockNumber && extend_to > 0);
924 : :
235 tmunro@postgresql.or 925 [ + + ]: 53605 : if (bmr.smgr == NULL)
926 : : {
927 : 5748 : bmr.smgr = RelationGetSmgr(bmr.rel);
928 : 5748 : bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
929 : : }
930 : :
931 : : /*
932 : : * If desired, create the file if it doesn't exist. If
933 : : * smgr_cached_nblocks[fork] is positive then it must exist, no need for
934 : : * an smgrexists call.
935 : : */
375 andres@anarazel.de 936 [ + + ]: 53605 : if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
235 tmunro@postgresql.or 937 [ + + ]: 5748 : (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
938 [ - + ]: 12 : bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
939 [ + + ]: 5736 : !smgrexists(bmr.smgr, fork))
940 : : {
941 : 5727 : LockRelationForExtension(bmr.rel, ExclusiveLock);
942 : :
943 : : /* recheck, fork might have been created concurrently */
944 [ + + ]: 5727 : if (!smgrexists(bmr.smgr, fork))
945 : 5716 : smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
946 : :
947 : 5727 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
948 : : }
949 : :
950 : : /*
951 : : * If requested, invalidate size cache, so that smgrnblocks asks the
952 : : * kernel.
953 : : */
375 andres@anarazel.de 954 [ + + ]: 53605 : if (flags & EB_CLEAR_SIZE_CACHE)
235 tmunro@postgresql.or 955 : 5748 : bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
956 : :
957 : : /*
958 : : * Estimate how many pages we'll need to extend by. This avoids acquiring
959 : : * unnecessarily many victim buffers.
960 : : */
961 : 53605 : current_size = smgrnblocks(bmr.smgr, fork);
962 : :
963 : : /*
964 : : * Since no-one else can be looking at the page contents yet, there is no
965 : : * difference between an exclusive lock and a cleanup-strength lock. Note
966 : : * that we pass the original mode to ReadBuffer_common() below, when
967 : : * falling back to reading the buffer to a concurrent relation extension.
968 : : */
366 andres@anarazel.de 969 [ + + + + ]: 53605 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
375 970 : 47502 : flags |= EB_LOCK_TARGET;
971 : :
972 [ + + ]: 109213 : while (current_size < extend_to)
973 : : {
974 : 55608 : uint32 num_pages = lengthof(buffers);
975 : : BlockNumber first_block;
976 : :
977 [ + + ]: 55608 : if ((uint64) current_size + num_pages > extend_to)
978 : 55542 : num_pages = extend_to - current_size;
979 : :
235 tmunro@postgresql.or 980 : 55608 : first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
981 : : num_pages, extend_to,
982 : : buffers, &extended_by);
983 : :
375 andres@anarazel.de 984 : 55608 : current_size = first_block + extended_by;
985 [ - + - - ]: 55608 : Assert(num_pages != 0 || current_size >= extend_to);
986 : :
208 peter@eisentraut.org 987 [ + + ]:GNC 117199 : for (uint32 i = 0; i < extended_by; i++)
988 : : {
375 andres@anarazel.de 989 [ + + ]:CBC 61591 : if (first_block + i != extend_to - 1)
990 : 7999 : ReleaseBuffer(buffers[i]);
991 : : else
992 : 53592 : buffer = buffers[i];
993 : : }
994 : : }
995 : :
996 : : /*
997 : : * It's possible that another backend concurrently extended the relation.
998 : : * In that case read the buffer.
999 : : *
1000 : : * XXX: Should we control this via a flag?
1001 : : */
1002 [ + + ]: 53605 : if (buffer == InvalidBuffer)
1003 : : {
1004 [ - + ]: 13 : Assert(extended_by == 0);
11 tmunro@postgresql.or 1005 :GNC 13 : buffer = ReadBuffer_common(bmr.rel, bmr.smgr, 0,
1006 : : fork, extend_to - 1, mode, strategy);
1007 : : }
1008 : :
375 andres@anarazel.de 1009 :CBC 53605 : return buffer;
1010 : : }
1011 : :
1012 : : /*
1013 : : * Zero a buffer and lock it, as part of the implementation of
1014 : : * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already
1015 : : * pinned. It does not have to be valid, but it is valid and locked on
1016 : : * return.
1017 : : */
1018 : : static void
11 tmunro@postgresql.or 1019 :GNC 265372 : ZeroBuffer(Buffer buffer, ReadBufferMode mode)
1020 : : {
1021 : : BufferDesc *bufHdr;
1022 : : uint32 buf_state;
1023 : :
1024 [ + + - + ]: 265372 : Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
1025 : :
1026 [ - + ]: 265372 : if (BufferIsLocal(buffer))
11 tmunro@postgresql.or 1027 :UNC 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1028 : : else
1029 : : {
11 tmunro@postgresql.or 1030 :GNC 265372 : bufHdr = GetBufferDescriptor(buffer - 1);
1031 [ + + ]: 265372 : if (mode == RBM_ZERO_AND_LOCK)
1032 : 263897 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1033 : : else
1034 : 1475 : LockBufferForCleanup(buffer);
1035 : : }
1036 : :
1037 : 265372 : memset(BufferGetPage(buffer), 0, BLCKSZ);
1038 : :
1039 [ - + ]: 265372 : if (BufferIsLocal(buffer))
1040 : : {
11 tmunro@postgresql.or 1041 :UNC 0 : buf_state = pg_atomic_read_u32(&bufHdr->state);
1042 : 0 : buf_state |= BM_VALID;
1043 : 0 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1044 : : }
1045 : : else
1046 : : {
11 tmunro@postgresql.or 1047 :GNC 265372 : buf_state = LockBufHdr(bufHdr);
1048 : 265372 : buf_state |= BM_VALID;
1049 : 265372 : UnlockBufHdr(bufHdr, buf_state);
1050 : : }
1051 : 265372 : }
1052 : :
1053 : : /*
1054 : : * Pin a buffer for a given block. *foundPtr is set to true if the block was
1055 : : * already present, or false if more work is required to either read it in or
1056 : : * zero it.
1057 : : */
1058 : : static pg_attribute_always_inline Buffer
1059 : 51857100 : PinBufferForBlock(Relation rel,
1060 : : SMgrRelation smgr,
1061 : : char smgr_persistence,
1062 : : ForkNumber forkNum,
1063 : : BlockNumber blockNum,
1064 : : BufferAccessStrategy strategy,
1065 : : bool *foundPtr)
1066 : : {
1067 : : BufferDesc *bufHdr;
1068 : : IOContext io_context;
1069 : : IOObject io_object;
1070 : : char persistence;
1071 : :
1072 [ - + ]: 51857100 : Assert(blockNum != P_NEW);
1073 : :
1074 : : /*
1075 : : * If there is no Relation it usually implies recovery and thus permanent,
1076 : : * but we take an argument because CreateAndCopyRelationData can reach us
1077 : : * with only an SMgrRelation for an unlogged relation that we don't want
1078 : : * to flag with BM_PERMANENT.
1079 : : */
1080 [ + + ]: 51857100 : if (rel)
1081 : 48511509 : persistence = rel->rd_rel->relpersistence;
1082 [ - + ]: 3345591 : else if (smgr_persistence == 0)
11 tmunro@postgresql.or 1083 :UNC 0 : persistence = RELPERSISTENCE_PERMANENT;
1084 : : else
11 tmunro@postgresql.or 1085 :GNC 3345591 : persistence = smgr_persistence;
1086 : :
1087 [ + + ]: 51857100 : if (persistence == RELPERSISTENCE_TEMP)
1088 : : {
1089 : 1254204 : io_context = IOCONTEXT_NORMAL;
1090 : 1254204 : io_object = IOOBJECT_TEMP_RELATION;
1091 : : }
1092 : : else
1093 : : {
1094 : 50602896 : io_context = IOContextForStrategy(strategy);
1095 : 50602896 : io_object = IOOBJECT_RELATION;
1096 : : }
1097 : :
1098 : : TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1099 : : smgr->smgr_rlocator.locator.spcOid,
1100 : : smgr->smgr_rlocator.locator.dbOid,
1101 : : smgr->smgr_rlocator.locator.relNumber,
1102 : : smgr->smgr_rlocator.backend);
1103 : :
1104 [ + + ]: 51857100 : if (persistence == RELPERSISTENCE_TEMP)
1105 : : {
1106 : 1254204 : bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1107 [ + + ]: 1254204 : if (*foundPtr)
5234 rhaas@postgresql.org 1108 : 1250405 : pgBufferUsage.local_blks_hit++;
1109 : : }
1110 : : else
1111 : : {
11 tmunro@postgresql.or 1112 : 50602896 : bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1113 : : strategy, foundPtr, io_context);
1114 [ + + ]: 50602895 : if (*foundPtr)
1115 : 49223226 : pgBufferUsage.shared_blks_hit++;
1116 : : }
1117 [ + + ]: 51857099 : if (rel)
1118 : : {
1119 : : /*
1120 : : * While pgBufferUsage's "read" counter isn't bumped unless we reach
1121 : : * WaitReadBuffers() (so, not for hits, and not for buffers that are
1122 : : * zeroed instead), the per-relation stats always count them.
1123 : : */
1124 [ + + + + : 48511508 : pgstat_count_buffer_read(rel);
+ + ]
1125 [ + + ]: 48511508 : if (*foundPtr)
1126 [ + + - + : 47552299 : pgstat_count_buffer_hit(rel);
+ + ]
1127 : : }
1128 [ + + ]: 51857099 : if (*foundPtr)
1129 : : {
375 andres@anarazel.de 1130 : 50473631 : VacuumPageHit++;
1131 : 50473631 : pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1132 [ + + ]: 50473631 : if (VacuumCostActive)
1133 : 1864214 : VacuumCostBalance += VacuumCostPageHit;
1134 : :
1135 : : TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1136 : : smgr->smgr_rlocator.locator.spcOid,
1137 : : smgr->smgr_rlocator.locator.dbOid,
1138 : : smgr->smgr_rlocator.locator.relNumber,
1139 : : smgr->smgr_rlocator.backend,
1140 : : true);
1141 : : }
1142 : :
11 tmunro@postgresql.or 1143 : 51857099 : return BufferDescriptorGetBuffer(bufHdr);
1144 : : }
1145 : :
1146 : : /*
1147 : : * ReadBuffer_common -- common logic for all ReadBuffer variants
1148 : : *
1149 : : * smgr is required, rel is optional unless using P_NEW.
1150 : : */
1151 : : static pg_attribute_always_inline Buffer
1152 : 49639253 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
1153 : : ForkNumber forkNum,
1154 : : BlockNumber blockNum, ReadBufferMode mode,
1155 : : BufferAccessStrategy strategy)
1156 : : {
1157 : : ReadBuffersOperation operation;
1158 : : Buffer buffer;
1159 : : int flags;
1160 : :
1161 : : /*
1162 : : * Backward compatibility path; most code should use ExtendBufferedRel()
1163 : : * instead, as acquiring the extension lock inside ExtendBufferedRel()
1164 : : * scales a lot better.
1165 : : */
11 tmunro@postgresql.or 1166 [ + + ]:CBC 49639253 : if (unlikely(blockNum == P_NEW))
1167 : : {
1168 : 240 : uint32 flags = EB_SKIP_EXTENSION_LOCK;
1169 : :
1170 : : /*
1171 : : * Since no-one else can be looking at the page contents yet, there is
1172 : : * no difference between an exclusive lock and a cleanup-strength
1173 : : * lock.
1174 : : */
1175 [ + - - + ]: 240 : if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
11 tmunro@postgresql.or 1176 :UBC 0 : flags |= EB_LOCK_FIRST;
1177 : :
11 tmunro@postgresql.or 1178 :GNC 240 : return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1179 : : }
1180 : :
1181 [ + + + + : 49639013 : if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
+ + ]
1182 : : mode == RBM_ZERO_AND_LOCK))
1183 : : {
1184 : : bool found;
1185 : :
1186 : 265372 : buffer = PinBufferForBlock(rel, smgr, smgr_persistence,
1187 : : forkNum, blockNum, strategy, &found);
1188 : 265372 : ZeroBuffer(buffer, mode);
1189 : 265372 : return buffer;
1190 : : }
1191 : :
1192 [ + + ]: 49373641 : if (mode == RBM_ZERO_ON_ERROR)
1193 : 1204347 : flags = READ_BUFFERS_ZERO_ON_ERROR;
1194 : : else
1195 : 48169294 : flags = 0;
1196 : 49373641 : operation.smgr = smgr;
1197 : 49373641 : operation.rel = rel;
1198 : 49373641 : operation.smgr_persistence = smgr_persistence;
1199 : 49373641 : operation.forknum = forkNum;
1200 : 49373641 : operation.strategy = strategy;
1201 [ + + ]: 49373641 : if (StartReadBuffer(&operation,
1202 : : &buffer,
1203 : : blockNum,
1204 : : flags))
1205 : 856593 : WaitReadBuffers(&operation);
1206 : :
1207 : 49373625 : return buffer;
1208 : : }
1209 : :
1210 : : static pg_attribute_always_inline bool
1211 : 51553537 : StartReadBuffersImpl(ReadBuffersOperation *operation,
1212 : : Buffer *buffers,
1213 : : BlockNumber blockNum,
1214 : : int *nblocks,
1215 : : int flags)
1216 : : {
1217 : 51553537 : int actual_nblocks = *nblocks;
1218 : 51553537 : int io_buffers_len = 0;
1219 : :
1220 [ - + ]: 51553537 : Assert(*nblocks > 0);
1221 [ - + ]: 51553537 : Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1222 : :
1223 [ + + ]: 52701831 : for (int i = 0; i < actual_nblocks; ++i)
1224 : : {
1225 : : bool found;
1226 : :
1227 : 103183455 : buffers[i] = PinBufferForBlock(operation->rel,
1228 : 51591728 : operation->smgr,
1229 : 51591728 : operation->smgr_persistence,
1230 : : operation->forknum,
1231 : : blockNum + i,
1232 : : operation->strategy,
1233 : : &found);
1234 : :
11 tmunro@postgresql.or 1235 [ + + ]:CBC 51591727 : if (found)
1236 : : {
1237 : : /*
1238 : : * Terminate the read as soon as we get a hit. It could be a
1239 : : * single buffer hit, or it could be a hit that follows a readable
1240 : : * range. We don't want to create more than one readable range,
1241 : : * so we stop here.
1242 : : */
11 tmunro@postgresql.or 1243 :GNC 50443433 : actual_nblocks = i + 1;
1244 : 50443433 : break;
1245 : : }
1246 : : else
1247 : : {
1248 : : /* Extend the readable range to cover this block. */
1249 : 1148294 : io_buffers_len++;
1250 : : }
1251 : : }
1252 : 51553536 : *nblocks = actual_nblocks;
1253 : :
1254 [ + + ]: 51553536 : if (likely(io_buffers_len == 0))
1255 : 50442667 : return false;
1256 : :
1257 : : /* Populate information needed for I/O. */
1258 : 1110869 : operation->buffers = buffers;
1259 : 1110869 : operation->blocknum = blockNum;
1260 : 1110869 : operation->flags = flags;
1261 : 1110869 : operation->nblocks = actual_nblocks;
1262 : 1110869 : operation->io_buffers_len = io_buffers_len;
1263 : :
1264 [ + + ]: 1110869 : if (flags & READ_BUFFERS_ISSUE_ADVICE)
1265 : : {
1266 : : /*
1267 : : * In theory we should only do this if PinBufferForBlock() had to
1268 : : * allocate new buffers above. That way, if two calls to
1269 : : * StartReadBuffers() were made for the same blocks before
1270 : : * WaitReadBuffers(), only the first would issue the advice. That'd be
1271 : : * a better simulation of true asynchronous I/O, which would only
1272 : : * start the I/O once, but isn't done here for simplicity. Note also
1273 : : * that the following call might actually issue two advice calls if we
1274 : : * cross a segment boundary; in a true asynchronous version we might
1275 : : * choose to process only one real I/O at a time in that case.
1276 : : */
1277 : 84 : smgrprefetch(operation->smgr,
1278 : : operation->forknum,
1279 : : blockNum,
1280 : 84 : operation->io_buffers_len);
1281 : : }
1282 : :
1283 : : /* Indicate that WaitReadBuffers() should be called. */
1284 : 1110869 : return true;
1285 : : }
1286 : :
1287 : : /*
1288 : : * Begin reading a range of blocks beginning at blockNum and extending for
1289 : : * *nblocks. On return, up to *nblocks pinned buffers holding those blocks
1290 : : * are written into the buffers array, and *nblocks is updated to contain the
1291 : : * actual number, which may be fewer than requested. Caller sets some of the
1292 : : * members of operation; see struct definition.
1293 : : *
1294 : : * If false is returned, no I/O is necessary. If true is returned, one I/O
1295 : : * has been started, and WaitReadBuffers() must be called with the same
1296 : : * operation object before the buffers are accessed. Along with the operation
1297 : : * object, the caller-supplied array of buffers must remain valid until
1298 : : * WaitReadBuffers() is called.
1299 : : *
1300 : : * Currently the I/O is only started with optional operating system advice if
1301 : : * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
1302 : : * happens synchronously in WaitReadBuffers(). In future work, true I/O could
1303 : : * be initiated here.
1304 : : */
1305 : : bool
1306 : 805176 : StartReadBuffers(ReadBuffersOperation *operation,
1307 : : Buffer *buffers,
1308 : : BlockNumber blockNum,
1309 : : int *nblocks,
1310 : : int flags)
1311 : : {
1312 : 805176 : return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags);
1313 : : }
1314 : :
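/*
 * A minimal sketch of the two-step API (illustrative only; "rel", "first",
 * "nwanted" and the caller-provided "buffers" array are assumed, with
 * nwanted no larger than MAX_IO_COMBINE_LIMIT): pin up to nwanted
 * consecutive blocks, paying for a physical read in WaitReadBuffers() only
 * when one is actually required.
 */
static inline int
ReadConsecutiveBlocksExample(Relation rel, BlockNumber first, int nwanted,
							 Buffer *buffers)
{
	ReadBuffersOperation operation;
	int			nblocks = nwanted;

	operation.rel = rel;
	operation.smgr = RelationGetSmgr(rel);
	operation.smgr_persistence = 0; /* take persistence from rel */
	operation.forknum = MAIN_FORKNUM;
	operation.strategy = NULL;

	if (StartReadBuffers(&operation, buffers, first, &nblocks, 0))
		WaitReadBuffers(&operation);

	return nblocks;				/* number of buffers actually pinned */
}
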
1315 : : /*
1316 : : * Single block version of the StartReadBuffers(). This might save a few
1317 : : * instructions when called from another translation unit, because it is
1318 : : * specialized for nblocks == 1.
1319 : : */
1320 : : bool
1321 : 50748361 : StartReadBuffer(ReadBuffersOperation *operation,
1322 : : Buffer *buffer,
1323 : : BlockNumber blocknum,
1324 : : int flags)
1325 : : {
1326 : 50748361 : int nblocks = 1;
1327 : : bool result;
1328 : :
1329 : 50748361 : result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags);
1330 [ - + ]: 50748360 : Assert(nblocks == 1); /* single block can't be short */
1331 : :
1332 : 50748360 : return result;
1333 : : }
1334 : :
1335 : : static inline bool
1336 : 1148290 : WaitReadBuffersCanStartIO(Buffer buffer, bool nowait)
1337 : : {
1338 [ + + ]: 1148290 : if (BufferIsLocal(buffer))
1339 : : {
1340 : 3799 : BufferDesc *bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1341 : :
1342 : 3799 : return (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
1343 : : }
1344 : : else
1345 : 1144491 : return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1346 : : }
1347 : :
1348 : : void
1349 : 1110865 : WaitReadBuffers(ReadBuffersOperation *operation)
1350 : : {
1351 : : Buffer *buffers;
1352 : : int nblocks;
1353 : : BlockNumber blocknum;
1354 : : ForkNumber forknum;
1355 : : IOContext io_context;
1356 : : IOObject io_object;
1357 : : char persistence;
1358 : :
1359 : : /*
1360 : : * Currently operations are only allowed to include a read of some range,
1361 : : * with an optional extra buffer that is already pinned at the end. So
1362 : : * nblocks can be at most one more than io_buffers_len.
1363 : : */
1364 [ + + - + ]: 1110865 : Assert((operation->nblocks == operation->io_buffers_len) ||
1365 : : (operation->nblocks == operation->io_buffers_len + 1));
1366 : :
1367 : : /* Find the range of the physical read we need to perform. */
1368 : 1110865 : nblocks = operation->io_buffers_len;
1369 [ - + ]: 1110865 : if (nblocks == 0)
11 tmunro@postgresql.or 1370 :UNC 0 : return; /* nothing to do */
1371 : :
11 tmunro@postgresql.or 1372 :GNC 1110865 : buffers = &operation->buffers[0];
1373 : 1110865 : blocknum = operation->blocknum;
1374 : 1110865 : forknum = operation->forknum;
1375 : :
1376 [ + + ]: 1110865 : persistence = operation->rel
1377 : 921102 : ? operation->rel->rd_rel->relpersistence
1378 : : : RELPERSISTENCE_PERMANENT;
1379 [ + + ]: 1110865 : if (persistence == RELPERSISTENCE_TEMP)
1380 : : {
1381 : 787 : io_context = IOCONTEXT_NORMAL;
1382 : 787 : io_object = IOOBJECT_TEMP_RELATION;
1383 : : }
1384 : : else
1385 : : {
1386 : 1110078 : io_context = IOContextForStrategy(operation->strategy);
1387 : 1110078 : io_object = IOOBJECT_RELATION;
1388 : : }
1389 : :
1390 : : /*
1391 : : * We count all these blocks as read by this backend. This is traditional
1392 : : * behavior, but might turn out to be not true if we find that someone
1393 : : * else has beaten us and completed the read of some of these blocks. In
1394 : : * that case the system globally double-counts, but we traditionally don't
1395 : : * count this as a "hit", and we don't have a separate counter for "miss,
1396 : : * but another backend completed the read".
1397 : : */
1398 [ + + ]: 1110865 : if (persistence == RELPERSISTENCE_TEMP)
1399 : 787 : pgBufferUsage.local_blks_read += nblocks;
1400 : : else
1401 : 1110078 : pgBufferUsage.shared_blks_read += nblocks;
1402 : :
1403 [ + + ]: 2221715 : for (int i = 0; i < nblocks; ++i)
1404 : : {
1405 : : int io_buffers_len;
1406 : : Buffer io_buffers[MAX_IO_COMBINE_LIMIT];
1407 : : void *io_pages[MAX_IO_COMBINE_LIMIT];
1408 : : instr_time io_start;
1409 : : BlockNumber io_first_block;
1410 : :
1411 : : /*
1412 : : * Skip this block if someone else has already completed it. If an
1413 : : * I/O is already in progress in another backend, this will wait for
1414 : : * the outcome: either done, or something went wrong and we will
1415 : : * retry.
1416 : : */
1417 [ + + ]: 1110865 : if (!WaitReadBuffersCanStartIO(buffers[i], false))
1418 : : {
1419 : : /*
1420 : : * Report this as a 'hit' for this backend, even though it must
1421 : : * have started out as a miss in PinBufferForBlock().
1422 : : */
1423 : : TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + i,
1424 : : operation->smgr->smgr_rlocator.locator.spcOid,
1425 : : operation->smgr->smgr_rlocator.locator.dbOid,
1426 : : operation->smgr->smgr_rlocator.locator.relNumber,
1427 : : operation->smgr->smgr_rlocator.backend,
1428 : : true);
1429 : 5935 : continue;
1430 : : }
1431 : :
1432 : : /* We found a buffer that we need to read in. */
1433 : 1104930 : io_buffers[0] = buffers[i];
1434 : 1104930 : io_pages[0] = BufferGetBlock(buffers[i]);
1435 : 1104930 : io_first_block = blocknum + i;
1436 : 1104930 : io_buffers_len = 1;
1437 : :
1438 : : /*
1439 : : * How many neighboring-on-disk blocks can we scatter-read into
1440 : : * other buffers at the same time? In this case we don't wait if we
1441 : : * see an I/O already in progress. We already hold BM_IO_IN_PROGRESS
1442 : : * for the head block, so we should get on with that I/O as soon as
1443 : : * possible. Blocks skipped here are revisited by the outer loop above.
1444 : : */
1445 [ + + + - ]: 1179780 : while ((i + 1) < nblocks &&
1446 : 37425 : WaitReadBuffersCanStartIO(buffers[i + 1], true))
1447 : : {
1448 : : /* Must be consecutive block numbers. */
1449 [ - + ]: 37425 : Assert(BufferGetBlockNumber(buffers[i + 1]) ==
1450 : : BufferGetBlockNumber(buffers[i]) + 1);
1451 : :
1452 : 37425 : io_buffers[io_buffers_len] = buffers[++i];
1453 : 37425 : io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1454 : : }
1455 : :
1456 : 1104930 : io_start = pgstat_prepare_io_time(track_io_timing);
1457 : 1104930 : smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
1458 : 1104915 : pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
1459 : : io_buffers_len);
1460 : :
1461 : : /* Verify each block we read, and terminate the I/O. */
1462 [ + + ]: 2247255 : for (int j = 0; j < io_buffers_len; ++j)
1463 : : {
1464 : : BufferDesc *bufHdr;
1465 : : Block bufBlock;
1466 : :
1467 [ + + ]: 1142340 : if (persistence == RELPERSISTENCE_TEMP)
1468 : : {
1469 : 3799 : bufHdr = GetLocalBufferDescriptor(-io_buffers[j] - 1);
1470 : 3799 : bufBlock = LocalBufHdrGetBlock(bufHdr);
1471 : : }
1472 : : else
1473 : : {
1474 : 1138541 : bufHdr = GetBufferDescriptor(io_buffers[j] - 1);
1475 : 1138541 : bufBlock = BufHdrGetBlock(bufHdr);
1476 : : }
1477 : :
1478 : : /* check for garbage data */
1479 [ - + ]: 1142340 : if (!PageIsVerifiedExtended((Page) bufBlock, io_first_block + j,
1480 : : PIV_LOG_WARNING | PIV_REPORT_STAT))
1481 : : {
11 tmunro@postgresql.or 1482 [ # # # # ]:UNC 0 : if ((operation->flags & READ_BUFFERS_ZERO_ON_ERROR) || zero_damaged_pages)
1483 : : {
1484 [ # # ]: 0 : ereport(WARNING,
1485 : : (errcode(ERRCODE_DATA_CORRUPTED),
1486 : : errmsg("invalid page in block %u of relation %s; zeroing out page",
1487 : : io_first_block + j,
1488 : : relpath(operation->smgr->smgr_rlocator, forknum))));
1489 : 0 : memset(bufBlock, 0, BLCKSZ);
1490 : : }
1491 : : else
1492 [ # # ]: 0 : ereport(ERROR,
1493 : : (errcode(ERRCODE_DATA_CORRUPTED),
1494 : : errmsg("invalid page in block %u of relation %s",
1495 : : io_first_block + j,
1496 : : relpath(operation->smgr->smgr_rlocator, forknum))));
1497 : : }
1498 : :
1499 : : /* Terminate I/O and set BM_VALID. */
11 tmunro@postgresql.or 1500 [ + + ]:GNC 1142340 : if (persistence == RELPERSISTENCE_TEMP)
1501 : : {
1502 : 3799 : uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
1503 : :
1504 : 3799 : buf_state |= BM_VALID;
1505 : 3799 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
1506 : : }
1507 : : else
1508 : : {
1509 : : /* Set BM_VALID, terminate IO, and wake up any waiters */
1510 : 1138541 : TerminateBufferIO(bufHdr, false, BM_VALID, true);
1511 : : }
1512 : :
1513 : : /* Report I/Os as completing individually. */
1514 : : TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, io_first_block + j,
1515 : : operation->smgr->smgr_rlocator.locator.spcOid,
1516 : : operation->smgr->smgr_rlocator.locator.dbOid,
1517 : : operation->smgr->smgr_rlocator.locator.relNumber,
1518 : : operation->smgr->smgr_rlocator.backend,
1519 : : false);
1520 : : }
1521 : :
1522 : 1104915 : VacuumPageMiss += io_buffers_len;
1523 [ + + ]: 1104915 : if (VacuumCostActive)
1524 : 15545 : VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1525 : : }
1526 : : }
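
/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * batching behavior of WaitReadBuffers() above, reduced to standalone C.
 * Consecutive blocks whose I/O we may start are combined into one vectored
 * read; a block whose I/O someone else is handling ends the current batch.
 * "can_start_io", "issue_read" and MAX_COMBINE are hypothetical stand-ins
 * for WaitReadBuffersCanStartIO(), smgrreadv() and MAX_IO_COMBINE_LIMIT.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_COMBINE 16

static void
issue_read(int first_block, int nblocks)
{
	printf("read blocks %d..%d in one call\n",
		   first_block, first_block + nblocks - 1);
}

int
main(void)
{
	/* which of blocks 10..17 still need their I/O started by this backend */
	bool		can_start_io[] = {true, true, true, false, true, true, true, true};
	int			nblocks = 8;
	int			first = 10;

	for (int i = 0; i < nblocks; i++)
	{
		int			len;

		if (!can_start_io[i])
			continue;			/* another backend is reading this block */

		/* start a batch at block first + i and extend it while we can */
		len = 1;
		while (i + 1 < nblocks && len < MAX_COMBINE && can_start_io[i + 1])
		{
			i++;
			len++;
		}
		issue_read(first + i - len + 1, len);	/* prints 10..12, then 14..17 */
	}
	return 0;
}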
1527 : :
1528 : : /*
1529 : : * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared
1530 : : * buffer. If no buffer exists already, selects a replacement victim and
1531 : : * evicts the old page, but does NOT read in the new page.
1532 : : *
1533 : : * "strategy" can be a buffer replacement strategy object, or NULL for
1534 : : * the default strategy. The selected buffer's usage_count is advanced when
1535 : : * using the default strategy, but otherwise possibly not (see PinBuffer).
1536 : : *
1537 : : * The returned buffer is pinned and is already marked as holding the
1538 : : * desired page. If it already did have the desired page, *foundPtr is
1539 : : * set true. Otherwise, *foundPtr is set false.
1540 : : *
1541 : : * io_context is the I/O context for this access, determined by the
1542 : : * caller; passing it in means this routine need not call
1543 : : * IOContextForStrategy() itself.
1544 : : *
1545 : : * No locks are held either at entry or exit.
1546 : : */
1547 : : static pg_attribute_always_inline BufferDesc *
4855 rhaas@postgresql.org 1548 :CBC 50602896 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
1549 : : BlockNumber blockNum,
1550 : : BufferAccessStrategy strategy,
1551 : : bool *foundPtr, IOContext io_context)
1552 : : {
1553 : : BufferTag newTag; /* identity of requested block */
1554 : : uint32 newHash; /* hash value for newTag */
1555 : : LWLock *newPartitionLock; /* buffer partition lock for it */
1556 : : int existing_buf_id;
1557 : : Buffer victim_buffer;
1558 : : BufferDesc *victim_buf_hdr;
1559 : : uint32 victim_buf_state;
1560 : :
1561 : : /* Make sure we will have room to remember the buffer pin */
158 heikki.linnakangas@i 1562 :GNC 50602896 : ResourceOwnerEnlarge(CurrentResourceOwner);
1563 : 50602896 : ReservePrivateRefCountEntry();
1564 : :
1565 : : /* create a tag so we can lookup the buffer */
627 rhaas@postgresql.org 1566 :CBC 50602896 : InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1567 : :
1568 : : /* determine its hash code and partition lock ID */
6475 tgl@sss.pgh.pa.us 1569 : 50602896 : newHash = BufTableHashCode(&newTag);
1570 : 50602896 : newPartitionLock = BufMappingPartitionLock(newHash);
1571 : :
1572 : : /* see if the block is in the buffer pool already */
1573 : 50602896 : LWLockAcquire(newPartitionLock, LW_SHARED);
375 andres@anarazel.de 1574 : 50602895 : existing_buf_id = BufTableLookup(&newTag, newHash);
1575 [ + + ]: 50602895 : if (existing_buf_id >= 0)
1576 : : {
1577 : : BufferDesc *buf;
1578 : : bool valid;
1579 : :
1580 : : /*
1581 : : * Found it. Now, pin the buffer so no one can steal it from the
1582 : : * buffer pool, and check to see if the correct data has been loaded
1583 : : * into the buffer.
1584 : : */
1585 : 49226458 : buf = GetBufferDescriptor(existing_buf_id);
1586 : :
6164 tgl@sss.pgh.pa.us 1587 : 49226458 : valid = PinBuffer(buf, strategy);
1588 : :
1589 : : /* Can release the mapping lock as soon as we've pinned it */
6475 1590 : 49226458 : LWLockRelease(newPartitionLock);
1591 : :
2433 peter_e@gmx.net 1592 : 49226458 : *foundPtr = true;
1593 : :
6981 tgl@sss.pgh.pa.us 1594 [ + + ]: 49226458 : if (!valid)
1595 : : {
1596 : : /*
1597 : : * We can only get here if (a) someone else is still reading in
1598 : : * the page, (b) a previous read attempt failed, or (c) someone
1599 : : * called StartReadBuffers() but not yet WaitReadBuffers().
1600 : : */
11 tmunro@postgresql.or 1601 :GNC 3986 : *foundPtr = false;
1602 : : }
1603 : :
9357 bruce@momjian.us 1604 :CBC 49226458 : return buf;
1605 : : }
1606 : :
1607 : : /*
1608 : : * Didn't find it in the buffer pool. We'll have to initialize a new
1609 : : * buffer. Remember to unlock the mapping lock while doing the work.
1610 : : */
6475 tgl@sss.pgh.pa.us 1611 : 1376437 : LWLockRelease(newPartitionLock);
1612 : :
1613 : : /*
1614 : : * Acquire a victim buffer. Somebody else might try to do the same, since
1615 : : * we don't hold any conflicting locks. If so, we'll have to undo our work
1616 : : * later.
1617 : : */
375 andres@anarazel.de 1618 : 1376437 : victim_buffer = GetVictimBuffer(strategy, io_context);
1619 : 1376437 : victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1620 : :
1621 : : /*
1622 : : * Try to make a hashtable entry for the buffer under its new tag. If
1623 : : * somebody else inserted another buffer for the tag, we'll release the
1624 : : * victim buffer we acquired and use the already inserted one.
1625 : : */
1626 : 1376437 : LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1627 : 1376437 : existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1628 [ + + ]: 1376437 : if (existing_buf_id >= 0)
1629 : : {
1630 : : BufferDesc *existing_buf_hdr;
1631 : : bool valid;
1632 : :
1633 : : /*
1634 : : * Got a collision. Someone has already done what we were about to do.
1635 : : * We'll just handle this as if it were found in the buffer pool in
1636 : : * the first place. First, give up the buffer we were planning to
1637 : : * use.
1638 : : *
1639 : : * We could do this after releasing the partition lock, but then we'd
1640 : : * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1641 : : * before acquiring the lock, for the rare case of such a collision.
1642 : : */
1643 : 2718 : UnpinBuffer(victim_buf_hdr);
1644 : :
1645 : : /*
1646 : : * The victim buffer we acquired previously is clean and unused, let
1647 : : * it be found again quickly
1648 : : */
1649 : 2718 : StrategyFreeBuffer(victim_buf_hdr);
1650 : :
1651 : : /* remaining code should match code at top of routine */
1652 : :
1653 : 2718 : existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1654 : :
1655 : 2718 : valid = PinBuffer(existing_buf_hdr, strategy);
1656 : :
1657 : : /* Can release the mapping lock as soon as we've pinned it */
1658 : 2718 : LWLockRelease(newPartitionLock);
1659 : :
1660 : 2718 : *foundPtr = true;
1661 : :
1662 [ + + ]: 2718 : if (!valid)
1663 : : {
1664 : : /*
1665 : : * We can only get here if (a) someone else is still reading in
1666 : : * the page, (b) a previous read attempt failed, or (c) someone
1667 : : * called StartReadBuffers() but not yet WaitReadBuffers().
1668 : : */
11 tmunro@postgresql.or 1669 :GNC 1964 : *foundPtr = false;
1670 : : }
1671 : :
375 andres@anarazel.de 1672 :CBC 2718 : return existing_buf_hdr;
1673 : : }
1674 : :
1675 : : /*
1676 : : * Need to lock the buffer header too in order to change its tag.
1677 : : */
1678 : 1373719 : victim_buf_state = LockBufHdr(victim_buf_hdr);
1679 : :
1680 : : /* some sanity checks while we hold the buffer header lock */
1681 [ - + ]: 1373719 : Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1682 [ - + ]: 1373719 : Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1683 : :
1684 : 1373719 : victim_buf_hdr->tag = newTag;
1685 : :
1686 : : /*
1687 : : * Make sure BM_PERMANENT is set for buffers that must be written at every
1688 : : * checkpoint. Unlogged buffers only need to be written at shutdown
1689 : : * checkpoints, except for their "init" forks, which need to be treated
1690 : : * just like permanent relations.
1691 : : */
1692 : 1373719 : victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2588 rhaas@postgresql.org 1693 [ + + - + ]: 1373719 : if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
375 andres@anarazel.de 1694 : 1373674 : victim_buf_state |= BM_PERMANENT;
1695 : :
1696 : 1373719 : UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1697 : :
6475 tgl@sss.pgh.pa.us 1698 : 1373719 : LWLockRelease(newPartitionLock);
1699 : :
1700 : : /*
1701 : : * Buffer contents are currently invalid.
1702 : : */
11 tmunro@postgresql.or 1703 :GNC 1373719 : *foundPtr = false;
1704 : :
375 andres@anarazel.de 1705 :CBC 1373719 : return victim_buf_hdr;
1706 : : }
1707 : :
1708 : : /*
1709 : : * InvalidateBuffer -- mark a shared buffer invalid and return it to the
1710 : : * freelist.
1711 : : *
1712 : : * The buffer header spinlock must be held at entry. We drop it before
1713 : : * returning. (This is sane because the caller must have locked the
1714 : : * buffer in order to be sure it should be dropped.)
1715 : : *
1716 : : * This is used only in contexts such as dropping a relation. We assume
1717 : : * that no other backend could possibly be interested in using the page,
1718 : : * so the only reason the buffer might be pinned is if someone else is
1719 : : * trying to write it out. We have to let them finish before we can
1720 : : * reclaim the buffer.
1721 : : *
1722 : : * The buffer could get reclaimed by someone else while we are waiting
1723 : : * to acquire the necessary locks; if so, don't mess it up.
1724 : : */
1725 : : static void
3072 rhaas@postgresql.org 1726 : 93608 : InvalidateBuffer(BufferDesc *buf)
1727 : : {
1728 : : BufferTag oldTag;
1729 : : uint32 oldHash; /* hash value for oldTag */
1730 : : LWLock *oldPartitionLock; /* buffer partition lock for it */
1731 : : uint32 oldFlags;
1732 : : uint32 buf_state;
1733 : :
1734 : : /* Save the original buffer tag before dropping the spinlock */
6981 tgl@sss.pgh.pa.us 1735 : 93608 : oldTag = buf->tag;
1736 : :
2926 andres@anarazel.de 1737 : 93608 : buf_state = pg_atomic_read_u32(&buf->state);
1738 [ - + ]: 93608 : Assert(buf_state & BM_LOCKED);
1739 : 93608 : UnlockBufHdr(buf, buf_state);
1740 : :
1741 : : /*
1742 : : * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1743 : : * worth storing the hashcode in BufferDesc so we need not recompute it
1744 : : * here? Probably not.
1745 : : */
6475 tgl@sss.pgh.pa.us 1746 : 93608 : oldHash = BufTableHashCode(&oldTag);
1747 : 93608 : oldPartitionLock = BufMappingPartitionLock(oldHash);
1748 : :
6981 1749 : 93609 : retry:
1750 : :
1751 : : /*
1752 : : * Acquire exclusive mapping lock in preparation for changing the buffer's
1753 : : * association.
1754 : : */
6475 1755 : 93609 : LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1756 : :
1757 : : /* Re-lock the buffer header */
2926 andres@anarazel.de 1758 : 93609 : buf_state = LockBufHdr(buf);
1759 : :
1760 : : /* If it's changed while we were waiting for lock, do nothing */
627 rhaas@postgresql.org 1761 [ + + ]: 93609 : if (!BufferTagsEqual(&buf->tag, &oldTag))
1762 : : {
2926 andres@anarazel.de 1763 : 1 : UnlockBufHdr(buf, buf_state);
6475 tgl@sss.pgh.pa.us 1764 : 1 : LWLockRelease(oldPartitionLock);
6981 1765 : 1 : return;
1766 : : }
1767 : :
1768 : : /*
1769 : : * We assume the only reason for it to be pinned is that someone else is
1770 : : * flushing the page out. Wait for them to finish. (This could be an
1771 : : * infinite loop if the refcount is messed up... it would be nice to time
1772 : : * out after awhile, but there seems no way to be sure how many loops may
1773 : : * be needed. Note that if the other guy has pinned the buffer but not
1774 : : * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1775 : : * be busy-looping here.)
1776 : : */
2926 andres@anarazel.de 1777 [ + + ]: 93608 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1778 : : {
1779 : 1 : UnlockBufHdr(buf, buf_state);
6475 tgl@sss.pgh.pa.us 1780 : 1 : LWLockRelease(oldPartitionLock);
1781 : : /* safety check: should definitely not be our *own* pin */
3168 andres@anarazel.de 1782 [ - + ]: 1 : if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
6967 tgl@sss.pgh.pa.us 1783 [ # # ]:UBC 0 : elog(ERROR, "buffer is pinned in InvalidateBuffer");
6981 tgl@sss.pgh.pa.us 1784 :CBC 1 : WaitIO(buf);
1785 : 1 : goto retry;
1786 : : }
1787 : :
1788 : : /*
1789 : : * Clear out the buffer's tag and flags. We must do this to ensure that
1790 : : * linear scans of the buffer array don't think the buffer is valid.
1791 : : */
2926 andres@anarazel.de 1792 : 93607 : oldFlags = buf_state & BUF_FLAG_MASK;
627 rhaas@postgresql.org 1793 : 93607 : ClearBufferTag(&buf->tag);
2926 andres@anarazel.de 1794 : 93607 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1795 : 93607 : UnlockBufHdr(buf, buf_state);
1796 : :
1797 : : /*
1798 : : * Remove the buffer from the lookup hashtable, if it was in there.
1799 : : */
6981 tgl@sss.pgh.pa.us 1800 [ + - ]: 93607 : if (oldFlags & BM_TAG_VALID)
6475 1801 : 93607 : BufTableDelete(&oldTag, oldHash);
1802 : :
1803 : : /*
1804 : : * Done with mapping lock.
1805 : : */
1806 : 93607 : LWLockRelease(oldPartitionLock);
1807 : :
1808 : : /*
1809 : : * Insert the buffer at the head of the list of free buffers.
1810 : : */
6164 1811 : 93607 : StrategyFreeBuffer(buf);
1812 : : }
1813 : :
1814 : : /*
1815 : : * Helper routine for GetVictimBuffer()
1816 : : *
1817 : : * Needs to be called on a buffer with a valid tag, pinned, but without the
1818 : : * buffer header spinlock held.
1819 : : *
1820 : : * Returns true if the buffer can be reused, in which case the buffer is only
1821 : : * pinned by this backend and marked as invalid, false otherwise.
1822 : : */
1823 : : static bool
375 andres@anarazel.de 1824 : 1003491 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
1825 : : {
1826 : : uint32 buf_state;
1827 : : uint32 hash;
1828 : : LWLock *partition_lock;
1829 : : BufferTag tag;
1830 : :
1831 [ - + ]: 1003491 : Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
1832 : :
1833 : : /* have buffer pinned, so it's safe to read tag without lock */
1834 : 1003491 : tag = buf_hdr->tag;
1835 : :
1836 : 1003491 : hash = BufTableHashCode(&tag);
1837 : 1003491 : partition_lock = BufMappingPartitionLock(hash);
1838 : :
1839 : 1003491 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1840 : :
1841 : : /* lock the buffer header */
1842 : 1003491 : buf_state = LockBufHdr(buf_hdr);
1843 : :
1844 : : /*
1845 : : * We have the buffer pinned, so nobody else should have been able to unset
1846 : : * this concurrently.
1847 : : */
1848 [ - + ]: 1003491 : Assert(buf_state & BM_TAG_VALID);
1849 [ - + ]: 1003491 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1850 [ - + ]: 1003491 : Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1851 : :
1852 : : /*
1853 : : * If somebody else pinned the buffer since, or even worse, dirtied it,
1854 : : * give up on this buffer: It's clearly in use.
1855 : : */
1856 [ + + + + ]: 1003491 : if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1857 : : {
1858 [ - + ]: 278 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1859 : :
1860 : 278 : UnlockBufHdr(buf_hdr, buf_state);
1861 : 278 : LWLockRelease(partition_lock);
1862 : :
1863 : 278 : return false;
1864 : : }
1865 : :
1866 : : /*
1867 : : * Clear out the buffer's tag and flags and usagecount. This is not
1868 : : * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1869 : : * doing anything with the buffer. But currently it's beneficial, as the
1870 : : * cheaper pre-checks in several linear scans of shared buffers use the
1871 : : * tag (see e.g. FlushDatabaseBuffers()).
1872 : : */
1873 : 1003213 : ClearBufferTag(&buf_hdr->tag);
1874 : 1003213 : buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1875 : 1003213 : UnlockBufHdr(buf_hdr, buf_state);
1876 : :
1877 [ - + ]: 1003213 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1878 : :
1879 : : /* finally delete buffer from the buffer mapping table */
1880 : 1003213 : BufTableDelete(&tag, hash);
1881 : :
1882 : 1003213 : LWLockRelease(partition_lock);
1883 : :
1884 [ - + ]: 1003213 : Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1885 [ - + ]: 1003213 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1886 [ - + ]: 1003213 : Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
1887 : :
1888 : 1003213 : return true;
1889 : : }
1890 : :
1891 : : static Buffer
1892 : 1582759 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
1893 : : {
1894 : : BufferDesc *buf_hdr;
1895 : : Buffer buf;
1896 : : uint32 buf_state;
1897 : : bool from_ring;
1898 : :
1899 : : /*
1900 : : * Ensure, while the spinlock's not yet held, that there's a free refcount
1901 : : * entry, and a resource owner slot for the pin.
1902 : : */
1903 : 1582759 : ReservePrivateRefCountEntry();
158 heikki.linnakangas@i 1904 :GNC 1582759 : ResourceOwnerEnlarge(CurrentResourceOwner);
1905 : :
1906 : : /* we return here if a prospective victim buffer gets used concurrently */
375 andres@anarazel.de 1907 :CBC 4398 : again:
1908 : :
1909 : : /*
1910 : : * Select a victim buffer. The buffer is returned with its header
1911 : : * spinlock still held!
1912 : : */
1913 : 1587157 : buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1914 : 1587157 : buf = BufferDescriptorGetBuffer(buf_hdr);
1915 : :
1916 [ - + ]: 1587157 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1917 : :
1918 : : /* Pin the buffer and then release the buffer spinlock */
1919 : 1587157 : PinBuffer_Locked(buf_hdr);
1920 : :
1921 : : /*
1922 : : * We shouldn't have any other pins for this buffer.
1923 : : */
1924 : 1587157 : CheckBufferIsPinnedOnce(buf);
1925 : :
1926 : : /*
1927 : : * If the buffer was dirty, try to write it out. There is a race
1928 : : * condition here, in that someone might dirty it after we released the
1929 : : * buffer header lock above, or even while we are writing it out (since
1930 : : * our share-lock won't prevent hint-bit updates). We will recheck the
1931 : : * dirty bit after re-locking the buffer header.
1932 : : */
1933 [ + + ]: 1587157 : if (buf_state & BM_DIRTY)
1934 : : {
1935 : : LWLock *content_lock;
1936 : :
1937 [ - + ]: 256301 : Assert(buf_state & BM_TAG_VALID);
1938 [ - + ]: 256301 : Assert(buf_state & BM_VALID);
1939 : :
1940 : : /*
1941 : : * We need a share-lock on the buffer contents to write it out (else
1942 : : * we might write invalid data, eg because someone else is compacting
1943 : : * the page contents while we write). We must use a conditional lock
1944 : : * acquisition here to avoid deadlock. Even though the buffer was not
1945 : : * pinned (and therefore surely not locked) when StrategyGetBuffer
1946 : : * returned it, someone else could have pinned and exclusive-locked it
1947 : : * by the time we get here. If we try to get the lock unconditionally,
1948 : : * we'd block waiting for them; if they later block waiting for us,
1949 : : * deadlock ensues. (This has been observed to happen when two
1950 : : * backends are both trying to split btree index pages, and the second
1951 : : * one just happens to be trying to split the page the first one got
1952 : : * from StrategyGetBuffer.)
1953 : : */
1954 : 256301 : content_lock = BufferDescriptorGetContentLock(buf_hdr);
1955 [ - + ]: 256301 : if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
1956 : : {
1957 : : /*
1958 : : * Someone else has locked the buffer, so give it up and loop back
1959 : : * to get another one.
1960 : : */
375 andres@anarazel.de 1961 :UBC 0 : UnpinBuffer(buf_hdr);
1962 : 0 : goto again;
1963 : : }
1964 : :
1965 : : /*
1966 : : * If using a nondefault strategy, and writing the buffer would
1967 : : * require a WAL flush, let the strategy decide whether to go ahead
1968 : : * and write/reuse the buffer or to choose another victim. We need a
1969 : : * lock to inspect the page LSN, so this can't be done inside
1970 : : * StrategyGetBuffer.
1971 : : */
375 andres@anarazel.de 1972 [ + + ]:CBC 256301 : if (strategy != NULL)
1973 : : {
1974 : : XLogRecPtr lsn;
1975 : :
1976 : : /* Read the LSN while holding buffer header lock */
1977 : 62421 : buf_state = LockBufHdr(buf_hdr);
1978 : 62421 : lsn = BufferGetLSN(buf_hdr);
1979 : 62421 : UnlockBufHdr(buf_hdr, buf_state);
1980 : :
1981 [ + + ]: 62421 : if (XLogNeedsFlush(lsn)
1982 [ + + ]: 6524 : && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
1983 : : {
1984 : 4120 : LWLockRelease(content_lock);
1985 : 4120 : UnpinBuffer(buf_hdr);
1986 : 4120 : goto again;
1987 : : }
1988 : : }
1989 : :
1990 : : /* OK, do the I/O */
1991 : 252181 : FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
1992 : 252181 : LWLockRelease(content_lock);
1993 : :
333 1994 : 252181 : ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
1995 : : &buf_hdr->tag);
1996 : : }
1997 : :
1998 : :
375 1999 [ + + ]: 1583037 : if (buf_state & BM_VALID)
2000 : : {
2001 : : /*
2002 : : * When a BufferAccessStrategy is in use, blocks evicted from shared
2003 : : * buffers are counted as IOOP_EVICT in the corresponding context
2004 : : * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2005 : : * strategy in two cases: 1) while initially claiming buffers for the
2006 : : * strategy ring 2) to replace an existing strategy ring buffer
2007 : : * because it is pinned or in use and cannot be reused.
2008 : : *
2009 : : * Blocks evicted from buffers already in the strategy ring are
2010 : : * counted as IOOP_REUSE in the corresponding strategy context.
2011 : : *
2012 : : * At this point, we can accurately count evictions and reuses,
2013 : : * because we have successfully claimed the valid buffer. Previously,
2014 : : * we may have been forced to release the buffer due to concurrent
2015 : : * pinners or erroring out.
2016 : : */
2017 : 1003490 : pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2018 [ + + ]: 1003490 : from_ring ? IOOP_REUSE : IOOP_EVICT);
2019 : : }
2020 : :
2021 : : /*
2022 : : * If the buffer has an entry in the buffer mapping table, delete it. This
2023 : : * can fail because another backend could have pinned or dirtied the
2024 : : * buffer.
2025 : : */
2026 [ + + + + ]: 1583037 : if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2027 : : {
2028 : 278 : UnpinBuffer(buf_hdr);
2029 : 278 : goto again;
2030 : : }
2031 : :
2032 : : /* a final set of sanity checks */
2033 : : #ifdef USE_ASSERT_CHECKING
2034 : 1582759 : buf_state = pg_atomic_read_u32(&buf_hdr->state);
2035 : :
2036 [ - + ]: 1582759 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2037 [ - + ]: 1582759 : Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2038 : :
2039 : 1582759 : CheckBufferIsPinnedOnce(buf);
2040 : : #endif
2041 : :
2042 : 1582759 : return buf;
2043 : : }
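
/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * deadlock-avoidance idiom used by GetVictimBuffer() above. Rather than
 * blocking on the victim's content lock (the holder might in turn end up
 * waiting for us), the code tries a conditional acquire and, on failure,
 * abandons that victim and picks another. The POSIX-mutex version below is
 * a standalone analogue; "content_lock", "pick_victim" and NVICTIMS are
 * hypothetical stand-ins, not PostgreSQL symbols.
 */
#include <pthread.h>
#include <stdio.h>

#define NVICTIMS 4

static pthread_mutex_t content_lock[NVICTIMS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

static int
pick_victim(int attempt)
{
	return attempt % NVICTIMS;	/* stand-in for StrategyGetBuffer() */
}

int
main(void)
{
	for (int attempt = 0;; attempt++)
	{
		int			victim = pick_victim(attempt);

		if (pthread_mutex_trylock(&content_lock[victim]) != 0)
			continue;			/* somebody holds it; try another victim */

		printf("flushing and reusing victim %d\n", victim);
		pthread_mutex_unlock(&content_lock[victim]);
		break;
	}
	return 0;
}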
2044 : :
2045 : : /*
2046 : : * Limit the number of pins a batch operation may additionally acquire, to
2047 : : * avoid running out of pinnable buffers.
2048 : : *
2049 : : * One additional pin is always allowed, as otherwise the operation likely
2050 : : * cannot be performed at all.
2051 : : *
2052 : : * The number of allowed pins for a backend is computed based on
2053 : : * shared_buffers and the maximum number of connections possible. That's very
2054 : : * pessimistic, but outside of toy-sized shared_buffers it should allow
2055 : : * sufficient pins.
2056 : : */
2057 : : void
2058 : 495486 : LimitAdditionalPins(uint32 *additional_pins)
2059 : : {
2060 : : uint32 max_backends;
2061 : : int max_proportional_pins;
2062 : :
2063 [ + + ]: 495486 : if (*additional_pins <= 1)
2064 : 177717 : return;
2065 : :
2066 : 317769 : max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
2067 : 317769 : max_proportional_pins = NBuffers / max_backends;
2068 : :
2069 : : /*
2070 : : * Subtract the approximate number of buffers already pinned by this
2071 : : * backend. We get the number of "overflowed" pins for free, but don't
2072 : : * know the number of pins in PrivateRefCountArray. The cost of
2073 : : * calculating that exactly doesn't seem worth it, so just assume the max.
2074 : : */
2075 : 317769 : max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2076 : :
265 2077 [ + + ]: 317769 : if (max_proportional_pins <= 0)
375 2078 : 72504 : max_proportional_pins = 1;
2079 : :
2080 [ + + ]: 317769 : if (*additional_pins > max_proportional_pins)
2081 : 73214 : *additional_pins = max_proportional_pins;
2082 : : }
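
/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): a worked
 * example of the clamping done by LimitAdditionalPins() above, with assumed
 * values: NBuffers = 16384 (128MB of shared buffers with 8kB pages), 134
 * backend and auxiliary process slots, no overflowed private refcount
 * entries, and REFCOUNT_ARRAY_ENTRIES = 8. A request for 512 extra pins is
 * then clamped to 16384/134 - 8 = 114.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t	additional_pins = 512;	/* what the batch operation asked for */
	int			nbuffers = 16384;	/* assumed shared_buffers, in pages */
	uint32_t	max_backends = 134; /* assumed MaxBackends + auxiliary procs */
	int			max_proportional_pins;

	max_proportional_pins = nbuffers / max_backends;	/* 122 */
	max_proportional_pins -= 0 + 8; /* overflowed entries + refcount array */

	if (max_proportional_pins <= 0)
		max_proportional_pins = 1;

	if (additional_pins > (uint32_t) max_proportional_pins)
		additional_pins = (uint32_t) max_proportional_pins;

	printf("allowed additional pins: %u\n", (unsigned) additional_pins);	/* 114 */
	return 0;
}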
2083 : :
2084 : : /*
2085 : : * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to
2086 : : * avoid duplicating the tracing and relpersistence-related logic.
2087 : : */
2088 : : static BlockNumber
235 tmunro@postgresql.or 2089 : 195797 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
2090 : : ForkNumber fork,
2091 : : BufferAccessStrategy strategy,
2092 : : uint32 flags,
2093 : : uint32 extend_by,
2094 : : BlockNumber extend_upto,
2095 : : Buffer *buffers,
2096 : : uint32 *extended_by)
2097 : : {
2098 : : BlockNumber first_block;
2099 : :
2100 : : TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2101 : : bmr.smgr->smgr_rlocator.locator.spcOid,
2102 : : bmr.smgr->smgr_rlocator.locator.dbOid,
2103 : : bmr.smgr->smgr_rlocator.locator.relNumber,
2104 : : bmr.smgr->smgr_rlocator.backend,
2105 : : extend_by);
2106 : :
2107 [ + + ]: 195797 : if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2108 : 9044 : first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2109 : : extend_by, extend_upto,
2110 : : buffers, &extend_by);
2111 : : else
2112 : 186753 : first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2113 : : extend_by, extend_upto,
2114 : : buffers, &extend_by);
375 andres@anarazel.de 2115 : 195797 : *extended_by = extend_by;
2116 : :
2117 : : TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2118 : : bmr.smgr->smgr_rlocator.locator.spcOid,
2119 : : bmr.smgr->smgr_rlocator.locator.dbOid,
2120 : : bmr.smgr->smgr_rlocator.locator.relNumber,
2121 : : bmr.smgr->smgr_rlocator.backend,
2122 : : *extended_by,
2123 : : first_block);
2124 : :
2125 : 195797 : return first_block;
2126 : : }
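
/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): how a
 * caller might reach the extension code above through the public
 * ExtendBufferedRelBy() entry point, requesting several new blocks in one
 * call with EB_LOCK_FIRST so that the first new buffer is returned
 * exclusive-locked. "rel" is assumed to be an opened and suitably locked
 * Relation; the BMR_REL() spelling and the exact argument list are quoted
 * from memory of the public header and may differ slightly. Error handling
 * and WAL-logging of the new pages are omitted.
 */
#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
extend_by_four(Relation rel)
{
	Buffer		buffers[4];
	uint32		extended_by = 0;
	BlockNumber first_block;

	first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
									  NULL, /* default strategy */
									  EB_LOCK_FIRST,
									  4, buffers, &extended_by);

	/* buffers[0] is pinned and exclusive-locked; the rest are only pinned */
	for (uint32 i = 0; i < extended_by; i++)
	{
		if (i == 0)
			UnlockReleaseBuffer(buffers[i]);
		else
			ReleaseBuffer(buffers[i]);
	}

	(void) first_block;
}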
2127 : :
2128 : : /*
2129 : : * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
2130 : : * shared buffers.
2131 : : */
2132 : : static BlockNumber
235 tmunro@postgresql.or 2133 : 186753 : ExtendBufferedRelShared(BufferManagerRelation bmr,
2134 : : ForkNumber fork,
2135 : : BufferAccessStrategy strategy,
2136 : : uint32 flags,
2137 : : uint32 extend_by,
2138 : : BlockNumber extend_upto,
2139 : : Buffer *buffers,
2140 : : uint32 *extended_by)
2141 : : {
2142 : : BlockNumber first_block;
375 andres@anarazel.de 2143 : 186753 : IOContext io_context = IOContextForStrategy(strategy);
2144 : : instr_time io_start;
2145 : :
2146 : 186753 : LimitAdditionalPins(&extend_by);
2147 : :
2148 : : /*
2149 : : * Acquire victim buffers for extension without holding extension lock.
2150 : : * Writing out victim buffers is the most expensive part of extending the
2151 : : * relation, particularly when doing so requires WAL flushes. Zeroing out
2152 : : * the buffers is also quite expensive, so do that before holding the
2153 : : * extension lock as well.
2154 : : *
2155 : : * These pages are pinned by us and not valid. While we hold the pin they
2156 : : * can't be acquired as victim buffers by another backend.
2157 : : */
2158 [ + + ]: 393075 : for (uint32 i = 0; i < extend_by; i++)
2159 : : {
2160 : : Block buf_block;
2161 : :
2162 : 206322 : buffers[i] = GetVictimBuffer(strategy, io_context);
2163 : 206322 : buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2164 : :
2165 : : /* new buffers are zero-filled */
2166 [ + - + - : 206322 : MemSet((char *) buf_block, 0, BLCKSZ);
+ - - + -
- ]
2167 : : }
2168 : :
2169 : : /*
2170 : : * Lock relation against concurrent extensions, unless requested not to.
2171 : : *
2172 : : * We use the same extension lock for all forks. That's unnecessarily
2173 : : * restrictive, but currently extensions for forks don't happen often
2174 : : * enough to make it worth locking more granularly.
2175 : : *
2176 : : * Note that another backend might have extended the relation by the time
2177 : : * we get the lock.
2178 : : */
2179 [ + + ]: 186753 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
235 tmunro@postgresql.or 2180 : 131904 : LockRelationForExtension(bmr.rel, ExclusiveLock);
2181 : :
2182 : : /*
2183 : : * If requested, invalidate size cache, so that smgrnblocks asks the
2184 : : * kernel.
2185 : : */
375 andres@anarazel.de 2186 [ + + ]: 186753 : if (flags & EB_CLEAR_SIZE_CACHE)
235 tmunro@postgresql.or 2187 : 6383 : bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2188 : :
2189 : 186753 : first_block = smgrnblocks(bmr.smgr, fork);
2190 : :
2191 : : /*
2192 : : * Now that we have the accurate relation size, check if the caller wants
2193 : : * us to extend to only up to a specific size. If there were concurrent
2194 : : * extensions, we might have acquired too many buffers and need to release
2195 : : * them.
2196 : : */
375 andres@anarazel.de 2197 [ + + ]: 186753 : if (extend_upto != InvalidBlockNumber)
2198 : : {
2199 : 55475 : uint32 orig_extend_by = extend_by;
2200 : :
2201 [ - + ]: 55475 : if (first_block > extend_upto)
375 andres@anarazel.de 2202 :UBC 0 : extend_by = 0;
375 andres@anarazel.de 2203 [ + + ]:CBC 55475 : else if ((uint64) first_block + extend_by > extend_upto)
2204 : 13 : extend_by = extend_upto - first_block;
2205 : :
2206 [ + + ]: 55506 : for (uint32 i = extend_by; i < orig_extend_by; i++)
2207 : : {
2208 : 31 : BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2209 : :
2210 : : /*
2211 : : * The victim buffer we acquired previously is clean and unused,
2212 : : * let it be found again quickly
2213 : : */
2214 : 31 : StrategyFreeBuffer(buf_hdr);
2215 : 31 : UnpinBuffer(buf_hdr);
2216 : : }
2217 : :
2218 [ + + ]: 55475 : if (extend_by == 0)
2219 : : {
2220 [ + - ]: 13 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
235 tmunro@postgresql.or 2221 : 13 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
375 andres@anarazel.de 2222 : 13 : *extended_by = extend_by;
2223 : 13 : return first_block;
2224 : : }
2225 : : }
2226 : :
2227 : : /* Fail if relation is already at maximum possible length */
2228 [ - + ]: 186740 : if ((uint64) first_block + extend_by >= MaxBlockNumber)
375 andres@anarazel.de 2229 [ # # ]:UBC 0 : ereport(ERROR,
2230 : : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2231 : : errmsg("cannot extend relation %s beyond %u blocks",
2232 : : relpath(bmr.smgr->smgr_rlocator, fork),
2233 : : MaxBlockNumber)));
2234 : :
2235 : : /*
2236 : : * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2237 : : *
2238 : : * This needs to happen before we extend the relation, because as soon as
2239 : : * we do, other backends can start to read in those pages.
2240 : : */
208 peter@eisentraut.org 2241 [ + + ]:GNC 393031 : for (uint32 i = 0; i < extend_by; i++)
2242 : : {
375 andres@anarazel.de 2243 :CBC 206291 : Buffer victim_buf = buffers[i];
2244 : 206291 : BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2245 : : BufferTag tag;
2246 : : uint32 hash;
2247 : : LWLock *partition_lock;
2248 : : int existing_id;
2249 : :
2250 : : /* in case we need to pin an existing buffer below */
158 heikki.linnakangas@i 2251 :GNC 206291 : ResourceOwnerEnlarge(CurrentResourceOwner);
2252 : 206291 : ReservePrivateRefCountEntry();
2253 : :
235 tmunro@postgresql.or 2254 :CBC 206291 : InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
375 andres@anarazel.de 2255 : 206291 : hash = BufTableHashCode(&tag);
2256 : 206291 : partition_lock = BufMappingPartitionLock(hash);
2257 : :
2258 : 206291 : LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2259 : :
2260 : 206291 : existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2261 : :
2262 : : /*
2263 : : * We get here only in the corner case where we are trying to extend
2264 : : * the relation but we found a pre-existing buffer. This can happen
2265 : : * because a prior attempt at extending the relation failed, and
2266 : : * because mdread doesn't complain about reads beyond EOF (when
2267 : : * zero_damaged_pages is ON) and so a previous attempt to read a block
2268 : : * beyond EOF could have left a "valid" zero-filled buffer.
2269 : : * Unfortunately, we have also seen this case occurring because of
2270 : : * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2271 : : * that doesn't account for a recent write. In that situation, the
2272 : : * pre-existing buffer would contain valid data that we don't want to
2273 : : * overwrite. Since the legitimate cases should always have left a
2274 : : * zero-filled buffer, complain if not PageIsNew.
2275 : : */
2276 [ - + ]: 206291 : if (existing_id >= 0)
2277 : : {
375 andres@anarazel.de 2278 :UBC 0 : BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2279 : : Block buf_block;
2280 : : bool valid;
2281 : :
2282 : : /*
2283 : : * Pin the existing buffer before releasing the partition lock,
2284 : : * preventing it from being evicted.
2285 : : */
2286 : 0 : valid = PinBuffer(existing_hdr, strategy);
2287 : :
2288 : 0 : LWLockRelease(partition_lock);
2289 : :
2290 : : /*
2291 : : * The victim buffer we acquired previously is clean and unused,
2292 : : * let it be found again quickly
2293 : : */
2294 : 0 : StrategyFreeBuffer(victim_buf_hdr);
2295 : 0 : UnpinBuffer(victim_buf_hdr);
2296 : :
2297 : 0 : buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2298 : 0 : buf_block = BufHdrGetBlock(existing_hdr);
2299 : :
2300 [ # # # # ]: 0 : if (valid && !PageIsNew((Page) buf_block))
2301 [ # # ]: 0 : ereport(ERROR,
2302 : : (errmsg("unexpected data beyond EOF in block %u of relation %s",
2303 : : existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2304 : : errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2305 : :
2306 : : /*
2307 : : * We *must* do smgr[zero]extend before succeeding, else the page
2308 : : * will not be reserved by the kernel, and the next P_NEW call
2309 : : * will decide to return the same page. Clear the BM_VALID bit,
2310 : : * do StartBufferIO() and proceed.
2311 : : *
2312 : : * Loop to handle the very small possibility that someone re-sets
2313 : : * BM_VALID between our clearing it and StartBufferIO inspecting
2314 : : * it.
2315 : : */
2316 : : do
2317 : : {
2318 : 0 : uint32 buf_state = LockBufHdr(existing_hdr);
2319 : :
2320 : 0 : buf_state &= ~BM_VALID;
2321 : 0 : UnlockBufHdr(existing_hdr, buf_state);
11 tmunro@postgresql.or 2322 [ # # ]:UNC 0 : } while (!StartBufferIO(existing_hdr, true, false));
2323 : : }
2324 : : else
2325 : : {
2326 : : uint32 buf_state;
2327 : :
375 andres@anarazel.de 2328 :CBC 206291 : buf_state = LockBufHdr(victim_buf_hdr);
2329 : :
2330 : : /* some sanity checks while we hold the buffer header lock */
2331 [ - + ]: 206291 : Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2332 [ - + ]: 206291 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2333 : :
2334 : 206291 : victim_buf_hdr->tag = tag;
2335 : :
2336 : 206291 : buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
235 tmunro@postgresql.or 2337 [ + + + + ]: 206291 : if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
375 andres@anarazel.de 2338 : 202198 : buf_state |= BM_PERMANENT;
2339 : :
2340 : 206291 : UnlockBufHdr(victim_buf_hdr, buf_state);
2341 : :
2342 : 206291 : LWLockRelease(partition_lock);
2343 : :
2344 : : /* XXX: could combine the locked operations in it with the above */
11 tmunro@postgresql.or 2345 :GNC 206291 : StartBufferIO(victim_buf_hdr, true, false);
2346 : : }
2347 : : }
2348 : :
120 michael@paquier.xyz 2349 : 186740 : io_start = pgstat_prepare_io_time(track_io_timing);
2350 : :
2351 : : /*
2352 : : * Note: if smgrzeroextend fails, we will end up with buffers that are
2353 : : * allocated but not marked BM_VALID. The next relation extension will
2354 : : * still select the same block number (because the relation didn't get any
2355 : : * longer on disk) and so future attempts to extend the relation will find
2356 : : * the same buffers (if they have not been recycled) but come right back
2357 : : * here to try smgrzeroextend again.
2358 : : *
2359 : : * We don't need to set checksum for all-zero pages.
2360 : : */
235 tmunro@postgresql.or 2361 :CBC 186740 : smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2362 : :
2363 : : /*
2364 : : * Release the file-extension lock; it's now OK for someone else to extend
2365 : : * the relation some more.
2366 : : *
2367 : : * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2368 : : * take noticeable time.
2369 : : */
375 andres@anarazel.de 2370 [ + + ]: 186740 : if (!(flags & EB_SKIP_EXTENSION_LOCK))
235 tmunro@postgresql.or 2371 : 131891 : UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2372 : :
373 andres@anarazel.de 2373 : 186740 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2374 : : io_start, extend_by);
2375 : :
2376 : : /* Set BM_VALID, terminate IO, and wake up any waiters */
208 peter@eisentraut.org 2377 [ + + ]:GNC 393031 : for (uint32 i = 0; i < extend_by; i++)
2378 : : {
375 andres@anarazel.de 2379 :CBC 206291 : Buffer buf = buffers[i];
2380 : 206291 : BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2381 : 206291 : bool lock = false;
2382 : :
2383 [ + + + + ]: 206291 : if (flags & EB_LOCK_FIRST && i == 0)
2384 : 131038 : lock = true;
2385 [ + + ]: 75253 : else if (flags & EB_LOCK_TARGET)
2386 : : {
2387 [ - + ]: 48059 : Assert(extend_upto != InvalidBlockNumber);
2388 [ + + ]: 48059 : if (first_block + i + 1 == extend_upto)
2389 : 47502 : lock = true;
2390 : : }
2391 : :
2392 [ + + ]: 206291 : if (lock)
2393 : 178540 : LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2394 : :
158 heikki.linnakangas@i 2395 :GNC 206291 : TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2396 : : }
2397 : :
375 andres@anarazel.de 2398 :CBC 186740 : pgBufferUsage.shared_blks_written += extend_by;
2399 : :
2400 : 186740 : *extended_by = extend_by;
2401 : :
2402 : 186740 : return first_block;
2403 : : }
2404 : :
2405 : : /*
2406 : : * BufferIsExclusiveLocked
2407 : : *
2408 : : * Checks if buffer is exclusive-locked.
2409 : : *
2410 : : * Buffer must be pinned.
2411 : : */
2412 : : bool
174 jdavis@postgresql.or 2413 :GNC 13823361 : BufferIsExclusiveLocked(Buffer buffer)
2414 : : {
2415 : : BufferDesc *bufHdr;
2416 : :
2417 [ - + ]: 13823361 : if (BufferIsLocal(buffer))
2418 : : {
174 jdavis@postgresql.or 2419 :UNC 0 : int bufid = -buffer - 1;
2420 : :
2421 : 0 : bufHdr = GetLocalBufferDescriptor(bufid);
2422 : : }
2423 : : else
2424 : : {
174 jdavis@postgresql.or 2425 :GNC 13823361 : bufHdr = GetBufferDescriptor(buffer - 1);
2426 : : }
2427 : :
2428 [ - + - + : 13823361 : Assert(BufferIsPinned(buffer));
- + ]
2429 : 13823361 : return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2430 : : LW_EXCLUSIVE);
2431 : : }
2432 : :
2433 : : /*
2434 : : * BufferIsDirty
2435 : : *
2436 : : * Checks if buffer is already dirty.
2437 : : *
2438 : : * Buffer must be pinned and exclusive-locked. (Without an exclusive lock,
2439 : : * the result may be stale before it's returned.)
2440 : : */
2441 : : bool
2442 : 13823361 : BufferIsDirty(Buffer buffer)
2443 : : {
2444 : : BufferDesc *bufHdr;
2445 : :
2446 [ - + ]: 13823361 : if (BufferIsLocal(buffer))
2447 : : {
174 jdavis@postgresql.or 2448 :UNC 0 : int bufid = -buffer - 1;
2449 : :
2450 : 0 : bufHdr = GetLocalBufferDescriptor(bufid);
2451 : : }
2452 : : else
2453 : : {
174 jdavis@postgresql.or 2454 :GNC 13823361 : bufHdr = GetBufferDescriptor(buffer - 1);
2455 : : }
2456 : :
2457 [ - + - + : 13823361 : Assert(BufferIsPinned(buffer));
- + ]
2458 [ - + ]: 13823361 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2459 : : LW_EXCLUSIVE));
2460 : :
2461 : 13823361 : return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2462 : : }
2463 : :
2464 : : /*
2465 : : * MarkBufferDirty
2466 : : *
2467 : : * Marks buffer contents as dirty (actual write happens later).
2468 : : *
2469 : : * Buffer must be pinned and exclusive-locked. (If caller does not hold
2470 : : * exclusive lock, then somebody could be in process of writing the buffer,
2471 : : * leading to risk of bad data written to disk.)
2472 : : */
2473 : : void
6589 tgl@sss.pgh.pa.us 2474 :CBC 20462434 : MarkBufferDirty(Buffer buffer)
2475 : : {
2476 : : BufferDesc *bufHdr;
2477 : : uint32 buf_state;
2478 : : uint32 old_buf_state;
2479 : :
7121 2480 [ - + ]: 20462434 : if (!BufferIsValid(buffer))
4683 peter_e@gmx.net 2481 [ # # ]:UBC 0 : elog(ERROR, "bad buffer ID: %d", buffer);
2482 : :
8780 tgl@sss.pgh.pa.us 2483 [ + + ]:CBC 20462434 : if (BufferIsLocal(buffer))
2484 : : {
6589 2485 : 1140952 : MarkLocalBufferDirty(buffer);
7974 bruce@momjian.us 2486 : 1140952 : return;
2487 : : }
2488 : :
3363 andres@anarazel.de 2489 : 19321482 : bufHdr = GetBufferDescriptor(buffer - 1);
2490 : :
3515 2491 [ - + - + : 19321482 : Assert(BufferIsPinned(buffer));
- + ]
2778 simon@2ndQuadrant.co 2492 [ - + ]: 19321482 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2493 : : LW_EXCLUSIVE));
2494 : :
2926 andres@anarazel.de 2495 : 19321482 : old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2496 : : for (;;)
2497 : : {
2498 [ + + ]: 19321887 : if (old_buf_state & BM_LOCKED)
2499 : 108 : old_buf_state = WaitBufHdrUnlocked(bufHdr);
2500 : :
2501 : 19321887 : buf_state = old_buf_state;
2502 : :
2503 [ - + ]: 19321887 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2504 : 19321887 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2505 : :
2506 [ + + ]: 19321887 : if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2507 : : buf_state))
2508 : 19321482 : break;
2509 : : }
2510 : :
2511 : : /*
2512 : : * If the buffer was not dirty already, do vacuum accounting.
2513 : : */
2514 [ + + ]: 19321482 : if (!(old_buf_state & BM_DIRTY))
2515 : : {
4524 alvherre@alvh.no-ip. 2516 : 594990 : VacuumPageDirty++;
4435 rhaas@postgresql.org 2517 : 594990 : pgBufferUsage.shared_blks_dirtied++;
4524 alvherre@alvh.no-ip. 2518 [ + + ]: 594990 : if (VacuumCostActive)
2519 : 6892 : VacuumCostBalance += VacuumCostPageDirty;
2520 : : }
2521 : : }
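
/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the usual
 * calling pattern around MarkBufferDirty(). The buffer is pinned, its
 * content lock is taken in exclusive mode, the page is modified inside a
 * critical section, and only then are the lock and pin released. The page
 * modification itself is elided, and "touch_block" is a hypothetical
 * helper; a WAL-logged relation would also XLogInsert() the change before
 * leaving the critical section.
 */
#include "postgres.h"

#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
touch_block(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	START_CRIT_SECTION();
	/* ... modify BufferGetPage(buf) here ... */
	MarkBufferDirty(buf);
	/* ... XLogInsert() the change here for a WAL-logged relation ... */
	END_CRIT_SECTION();

	UnlockReleaseBuffer(buf);
}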
2522 : :
2523 : : /*
2524 : : * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
2525 : : *
2526 : : * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
2527 : : * compared to calling the two routines separately. Now it's mainly just
2528 : : * a convenience function. However, if the passed buffer is valid and
2529 : : * already contains the desired block, we just return it as-is; and that
2530 : : * does save considerable work compared to a full release and reacquire.
2531 : : *
2532 : : * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
2533 : : * buffer actually needs to be released. This case is the same as ReadBuffer,
2534 : : * but can save some tests in the caller.
2535 : : */
2536 : : Buffer
10141 scrappy@hub.org 2537 : 24803481 : ReleaseAndReadBuffer(Buffer buffer,
2538 : : Relation relation,
2539 : : BlockNumber blockNum)
2540 : : {
5421 bruce@momjian.us 2541 : 24803481 : ForkNumber forkNum = MAIN_FORKNUM;
2542 : : BufferDesc *bufHdr;
2543 : :
8373 tgl@sss.pgh.pa.us 2544 [ + + ]: 24803481 : if (BufferIsValid(buffer))
2545 : : {
3515 andres@anarazel.de 2546 [ - + + + : 14600072 : Assert(BufferIsPinned(buffer));
- + ]
8373 tgl@sss.pgh.pa.us 2547 [ + + ]: 14600072 : if (BufferIsLocal(buffer))
2548 : : {
3363 andres@anarazel.de 2549 : 105224 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
8345 tgl@sss.pgh.pa.us 2550 [ + + + - ]: 107234 : if (bufHdr->tag.blockNum == blockNum &&
599 rhaas@postgresql.org 2551 [ + - ]: 4020 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2552 : 2010 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
8345 tgl@sss.pgh.pa.us 2553 : 2010 : return buffer;
375 andres@anarazel.de 2554 : 103214 : UnpinLocalBuffer(buffer);
2555 : : }
2556 : : else
2557 : : {
3363 2558 : 14494848 : bufHdr = GetBufferDescriptor(buffer - 1);
2559 : : /* we have pin, so it's ok to examine tag without spinlock */
8345 tgl@sss.pgh.pa.us 2560 [ + + + - ]: 19365605 : if (bufHdr->tag.blockNum == blockNum &&
599 rhaas@postgresql.org 2561 [ + - ]: 9741514 : BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
2562 : 4870757 : BufTagGetForkNum(&bufHdr->tag) == forkNum)
8345 tgl@sss.pgh.pa.us 2563 : 4870757 : return buffer;
562 michael@paquier.xyz 2564 : 9624091 : UnpinBuffer(bufHdr);
2565 : : }
2566 : : }
2567 : :
6981 tgl@sss.pgh.pa.us 2568 : 19930714 : return ReadBuffer(relation, blockNum);
2569 : : }
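
/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): typical
 * use of ReleaseAndReadBuffer() to walk a relation block by block while
 * holding at most one pin at a time; when the requested block is the one
 * already pinned, the pin is simply reused. "scan_relation" is a
 * hypothetical caller, and the per-page work is elided.
 */
#include "postgres.h"

#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
scan_relation(Relation rel)
{
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
	Buffer		buf = InvalidBuffer;

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		buf = ReleaseAndReadBuffer(buf, rel, blkno);

		LockBuffer(buf, BUFFER_LOCK_SHARE);
		/* ... examine BufferGetPage(buf) ... */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}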
2570 : :
2571 : : /*
2572 : : * PinBuffer -- make buffer unavailable for replacement.
2573 : : *
2574 : : * For the default access strategy, the buffer's usage_count is incremented
2575 : : * when we first pin it; for other strategies we just make sure the usage_count
2576 : : * isn't zero. (The idea of the latter is that we don't want synchronized
2577 : : * heap scans to inflate the count, but we need it to not be zero to discourage
2578 : : * other backends from stealing buffers from our ring. As long as we cycle
2579 : : * through the ring faster than the global clock-sweep cycles, buffers in
2580 : : * our ring won't be chosen as victims for replacement by other backends.)
2581 : : *
2582 : : * This should be applied only to shared buffers, never local ones.
2583 : : *
2584 : : * Since buffers are pinned/unpinned very frequently, pin buffers without
2585 : : * taking the buffer header lock; instead update the state variable in a
2586 : : * loop of CAS operations. Hopefully it's just a single CAS.
2587 : : *
2588 : : * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
2589 : : * must have been done already.
2590 : : *
2591 : : * Returns true if buffer is BM_VALID, else false. This provision allows
2592 : : * some callers to avoid an extra spinlock cycle.
2593 : : */
2594 : : static bool
3072 rhaas@postgresql.org 2595 : 49229176 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
2596 : : {
3168 andres@anarazel.de 2597 : 49229176 : Buffer b = BufferDescriptorGetBuffer(buf);
2598 : : bool result;
2599 : : PrivateRefCountEntry *ref;
2600 : :
375 2601 [ - + ]: 49229176 : Assert(!BufferIsLocal(b));
158 heikki.linnakangas@i 2602 [ - + ]:GNC 49229176 : Assert(ReservedRefCountEntry != NULL);
2603 : :
3168 andres@anarazel.de 2604 :CBC 49229176 : ref = GetPrivateRefCountEntry(b, true);
2605 : :
3373 2606 [ + + ]: 49229176 : if (ref == NULL)
2607 : : {
2608 : : uint32 buf_state;
2609 : : uint32 old_buf_state;
2610 : :
3168 2611 : 47225541 : ref = NewPrivateRefCountEntry(b);
2612 : :
2926 2613 : 47225541 : old_buf_state = pg_atomic_read_u32(&buf->state);
2614 : : for (;;)
2615 : : {
2616 [ + + ]: 47240049 : if (old_buf_state & BM_LOCKED)
2617 : 1074 : old_buf_state = WaitBufHdrUnlocked(buf);
2618 : :
2619 : 47240049 : buf_state = old_buf_state;
2620 : :
2621 : : /* increase refcount */
2622 : 47240049 : buf_state += BUF_REFCOUNT_ONE;
2623 : :
2582 teodor@sigaev.ru 2624 [ + + ]: 47240049 : if (strategy == NULL)
2625 : : {
2626 : : /* Default case: increase usagecount unless already max. */
2627 [ + + ]: 46657503 : if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
2628 : 2649640 : buf_state += BUF_USAGECOUNT_ONE;
2629 : : }
2630 : : else
2631 : : {
2632 : : /*
2633 : : * Ring buffers shouldn't evict others from pool. Thus we
2634 : : * don't make usagecount more than 1.
2635 : : */
2636 [ + + ]: 582546 : if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2637 : 22238 : buf_state += BUF_USAGECOUNT_ONE;
2638 : : }
2639 : :
2926 andres@anarazel.de 2640 [ + + ]: 47240049 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2641 : : buf_state))
2642 : : {
2643 : 47225541 : result = (buf_state & BM_VALID) != 0;
2644 : :
2645 : : /*
2646 : : * Assume that we acquired a buffer pin for the purposes of
2647 : : * Valgrind buffer client checks (even in !result case) to
2648 : : * keep things simple. Buffers that are unsafe to access are
2649 : : * not generally guaranteed to be marked undefined or
2650 : : * non-accessible in any case.
2651 : : */
2652 : : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2653 : 47225541 : break;
2654 : : }
2655 : : }
2656 : : }
2657 : : else
2658 : : {
2659 : : /*
2660 : : * If we previously pinned the buffer, it is likely to be valid, but
2661 : : * it may not be if StartReadBuffers() was called and
2662 : : * WaitReadBuffers() hasn't been called yet. We'll check by loading
2663 : : * the flags without locking. This is racy, but it's OK to return
2664 : : * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
2665 : : * it'll see that it's now valid.
2666 : : *
2667 : : * Note: We deliberately avoid a Valgrind client request here.
2668 : : * Individual access methods can optionally superimpose buffer page
2669 : : * client requests on top of our client requests to enforce that
2670 : : * buffers are only accessed while locked (and pinned). It's possible
2671 : : * that the buffer page is legitimately non-accessible here. We
2672 : : * cannot meddle with that.
2673 : : */
11 tmunro@postgresql.or 2674 :GNC 2003635 : result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
2675 : : }
2676 : :
3515 andres@anarazel.de 2677 :CBC 49229176 : ref->refcount++;
2678 [ - + ]: 49229176 : Assert(ref->refcount > 0);
3168 2679 : 49229176 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
6981 tgl@sss.pgh.pa.us 2680 : 49229176 : return result;
2681 : : }
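
/*
 * Illustrative sketch (editor's addition, not part of bufmgr.c): the
 * lock-free read/modify/compare-and-swap loop used by PinBuffer() above
 * (and by the unpin path further down), reduced to standalone C11 atomics.
 * The real buffer state word also packs usage-count and flag bits; here
 * only a bare pin counter is shown.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t buf_state;

static void
pin(void)
{
	uint32_t	old_state = atomic_load(&buf_state);

	for (;;)
	{
		uint32_t	new_state = old_state + 1;	/* increase refcount */

		/* on failure, old_state is refreshed with the current value */
		if (atomic_compare_exchange_weak(&buf_state, &old_state, new_state))
			break;
	}
}

static void
unpin(void)
{
	uint32_t	old_state = atomic_load(&buf_state);

	for (;;)
	{
		uint32_t	new_state = old_state - 1;	/* decrease refcount */

		if (atomic_compare_exchange_weak(&buf_state, &old_state, new_state))
			break;
	}
}

int
main(void)
{
	pin();
	pin();
	unpin();
	printf("refcount = %u\n", (unsigned) atomic_load(&buf_state));	/* 1 */
	return 0;
}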
2682 : :
2683 : : /*
2684 : : * PinBuffer_Locked -- as above, but caller already locked the buffer header.
2685 : : * The spinlock is released before return.
2686 : : *
2687 : : * As this function is called with the spinlock held, the caller must have
2688 : : * previously called ReservePrivateRefCountEntry() and
2689 : : * ResourceOwnerEnlarge(CurrentResourceOwner).
2690 : : *
2691 : : * Currently, no callers of this function want to modify the buffer's
2692 : : * usage_count at all, so there's no need for a strategy parameter.
2693 : : * Also we don't bother with a BM_VALID test (the caller could check that for
2694 : : * itself).
2695 : : *
2696 : : * Also all callers only ever use this function when it's known that the
2697 : : * buffer can't have a preexisting pin by this backend. That allows us to skip
2698 : : * searching the private refcount array & hash, which is a boon, because the
2699 : : * spinlock is still held.
2700 : : *
2701 : : * Note: use of this routine is frequently mandatory, not just an optimization
2702 : : * to save a spin lock/unlock cycle, because we need to pin a buffer before
2703 : : * its state can change under us.
2704 : : */
2705 : : static void
3072 rhaas@postgresql.org 2706 : 2381770 : PinBuffer_Locked(BufferDesc *buf)
2707 : : {
2708 : : Buffer b;
2709 : : PrivateRefCountEntry *ref;
2710 : : uint32 buf_state;
2711 : :
2712 : : /*
2713 : : * As explained, we don't expect any preexisting pins. That allows us to
2714 : : * manipulate the PrivateRefCount after releasing the spinlock.
2715 : : */
3168 andres@anarazel.de 2716 [ - + ]: 2381770 : Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
2717 : :
2718 : : /*
2719 : : * Buffer can't have a preexisting pin, so mark its page as defined to
2720 : : * Valgrind (this is similar to the PinBuffer() case where the backend
2721 : : * doesn't already have a buffer pin)
2722 : : */
2723 : : VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2724 : :
2725 : : /*
2726 : : * Since we hold the buffer spinlock, we can update the buffer state and
2727 : : * release the lock in one operation.
2728 : : */
2926 2729 : 2381770 : buf_state = pg_atomic_read_u32(&buf->state);
2730 [ - + ]: 2381770 : Assert(buf_state & BM_LOCKED);
2731 : 2381770 : buf_state += BUF_REFCOUNT_ONE;
2732 : 2381770 : UnlockBufHdr(buf, buf_state);
2733 : :
3168 2734 : 2381770 : b = BufferDescriptorGetBuffer(buf);
2735 : :
2736 : 2381770 : ref = NewPrivateRefCountEntry(b);
3515 2737 : 2381770 : ref->refcount++;
2738 : :
3168 2739 : 2381770 : ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
7300 tgl@sss.pgh.pa.us 2740 : 2381770 : }
2741 : :
2742 : : /*
2743 : : * UnpinBuffer -- make buffer available for replacement.
2744 : : *
2745 : : * This should be applied only to shared buffers, never local ones. This
2746 : : * always adjusts CurrentResourceOwner.
2747 : : */
2748 : : static void
562 michael@paquier.xyz 2749 : 60818830 : UnpinBuffer(BufferDesc *buf)
2750 : : {
158 heikki.linnakangas@i 2751 :GNC 60818830 : Buffer b = BufferDescriptorGetBuffer(buf);
2752 : :
2753 : 60818830 : ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
2754 : 60818830 : UnpinBufferNoOwner(buf);
2755 : 60818830 : }
2756 : :
2757 : : static void
2758 : 60822581 : UnpinBufferNoOwner(BufferDesc *buf)
2759 : : {
2760 : : PrivateRefCountEntry *ref;
3168 andres@anarazel.de 2761 :CBC 60822581 : Buffer b = BufferDescriptorGetBuffer(buf);
2762 : :
375 2763 [ - + ]: 60822581 : Assert(!BufferIsLocal(b));
2764 : :
2765 : : /* not moving as we're likely deleting it soon anyway */
3168 2766 : 60822581 : ref = GetPrivateRefCountEntry(b, false);
3515 2767 [ - + ]: 60822581 : Assert(ref != NULL);
2768 [ - + ]: 60822581 : Assert(ref->refcount > 0);
2769 : 60822581 : ref->refcount--;
2770 [ + + ]: 60822581 : if (ref->refcount == 0)
2771 : : {
2772 : : uint32 buf_state;
2773 : : uint32 old_buf_state;
2774 : :
2775 : : /*
2776 : : * Mark buffer non-accessible to Valgrind.
2777 : : *
2778 : : * Note that the buffer may have already been marked non-accessible
2779 : : * within access method code that enforces that buffers are only
2780 : : * accessed while a buffer lock is held.
2781 : : */
2782 : : VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
2783 : :
2784 : : /* I'd better not still hold the buffer content lock */
3043 rhaas@postgresql.org 2785 [ - + ]: 49607311 : Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
2786 : :
2787 : : /*
2788 : : * Decrement the shared reference count.
2789 : : *
2790 : : * Since a buffer spinlock holder can update the state with a plain write,
2791 : : * it's not safe to use an atomic decrement here; use a CAS loop instead.
2792 : : */
2926 andres@anarazel.de 2793 : 49607311 : old_buf_state = pg_atomic_read_u32(&buf->state);
2794 : : for (;;)
2795 : : {
2796 [ + + ]: 49622418 : if (old_buf_state & BM_LOCKED)
2797 : 952 : old_buf_state = WaitBufHdrUnlocked(buf);
2798 : :
2799 : 49622418 : buf_state = old_buf_state;
2800 : :
2801 : 49622418 : buf_state -= BUF_REFCOUNT_ONE;
2802 : :
2803 [ + + ]: 49622418 : if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2804 : : buf_state))
2805 : 49607311 : break;
2806 : : }
2807 : :
2808 : : /* Support LockBufferForCleanup() */
2809 [ + + ]: 49607311 : if (buf_state & BM_PIN_COUNT_WAITER)
2810 : : {
2811 : : /*
2812 : : * Acquire the buffer header lock, re-check that there's a waiter.
2813 : : * Another backend could have unpinned this buffer, and already
2814 : : * woken up the waiter. There's no danger of the buffer being
2815 : : * replaced after we unpinned it above, as it's pinned by the
2816 : : * waiter.
2817 : : */
2926 andres@anarazel.de 2818 :GBC 2 : buf_state = LockBufHdr(buf);
2819 : :
2820 [ + - ]: 2 : if ((buf_state & BM_PIN_COUNT_WAITER) &&
2821 [ + - ]: 2 : BUF_STATE_GET_REFCOUNT(buf_state) == 1)
2822 : 2 : {
2823 : : /* we just released the last pin other than the waiter's */
850 tmunro@postgresql.or 2824 : 2 : int wait_backend_pgprocno = buf->wait_backend_pgprocno;
2825 : :
2926 andres@anarazel.de 2826 : 2 : buf_state &= ~BM_PIN_COUNT_WAITER;
2827 : 2 : UnlockBufHdr(buf, buf_state);
850 tmunro@postgresql.or 2828 : 2 : ProcSendSignal(wait_backend_pgprocno);
2829 : : }
2830 : : else
2926 andres@anarazel.de 2831 :UBC 0 : UnlockBufHdr(buf, buf_state);
2832 : : }
3515 andres@anarazel.de 2833 :CBC 49607311 : ForgetPrivateRefCountEntry(ref);
2834 : : }
7300 tgl@sss.pgh.pa.us 2835 : 60822581 : }
2836 : :
2837 : : #define ST_SORT sort_checkpoint_bufferids
2838 : : #define ST_ELEMENT_TYPE CkptSortItem
2839 : : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
2840 : : #define ST_SCOPE static
2841 : : #define ST_DEFINE
2842 : : #include <lib/sort_template.h>
2843 : :
2844 : : /*
2845 : : * BufferSync -- Write out all dirty buffers in the pool.
2846 : : *
2847 : : * This is called at checkpoint time to write out all dirty shared buffers.
2848 : : * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
2849 : : * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
2850 : : * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
2851 : : * unlogged buffers, which are otherwise skipped. The remaining flags
2852 : : * currently have no effect here.
2853 : : */
2854 : : static void
6135 2855 : 1153 : BufferSync(int flags)
2856 : : {
2857 : : uint32 buf_state;
2858 : : int buf_id;
2859 : : int num_to_scan;
2860 : : int num_spaces;
2861 : : int num_processed;
2862 : : int num_written;
2977 andres@anarazel.de 2863 : 1153 : CkptTsStatus *per_ts_stat = NULL;
2864 : : Oid last_tsid;
2865 : : binaryheap *ts_heap;
2866 : : int i;
4855 rhaas@postgresql.org 2867 : 1153 : int mask = BM_DIRTY;
2868 : : WritebackContext wb_context;
2869 : :
2870 : : /*
2871 : : * Unless this is a shutdown checkpoint or we have been explicitly told,
2872 : : * we write only permanent, dirty buffers. But at shutdown or end of
2873 : : * recovery, we write all dirty buffers.
2874 : : */
3464 andres@anarazel.de 2875 [ + + ]: 1153 : if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2876 : : CHECKPOINT_FLUSH_ALL))))
4482 rhaas@postgresql.org 2877 : 470 : mask |= BM_PERMANENT;
2878 : :
2879 : : /*
2880 : : * Loop over all buffers, and mark the ones that need to be written with
2881 : : * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2882 : : * can estimate how much work needs to be done.
2883 : : *
2884 : : * This allows us to write only those pages that were dirty when the
2885 : : * checkpoint began, and not those that get dirtied while it proceeds.
2886 : : * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2887 : : * later in this function, or by normal backends or the bgwriter cleaning
2888 : : * scan, the flag is cleared. Any buffer dirtied after this point won't
2889 : : * have the flag set.
2890 : : *
2891 : : * Note that if we fail to write some buffer, we may leave buffers with
2892 : : * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2893 : : * certainly need to be written for the next checkpoint attempt, too.
2894 : : */
2977 andres@anarazel.de 2895 : 1153 : num_to_scan = 0;
6135 tgl@sss.pgh.pa.us 2896 [ + + ]: 10698353 : for (buf_id = 0; buf_id < NBuffers; buf_id++)
2897 : : {
3072 rhaas@postgresql.org 2898 : 10697200 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2899 : :
2900 : : /*
2901 : : * Header spinlock is enough to examine BM_DIRTY, see comment in
2902 : : * SyncOneBuffer.
2903 : : */
2926 andres@anarazel.de 2904 : 10697200 : buf_state = LockBufHdr(bufHdr);
2905 : :
2906 [ + + ]: 10697200 : if ((buf_state & mask) == mask)
2907 : : {
2908 : : CkptSortItem *item;
2909 : :
2910 : 241920 : buf_state |= BM_CHECKPOINT_NEEDED;
2911 : :
2977 2912 : 241920 : item = &CkptBufferIds[num_to_scan++];
2913 : 241920 : item->buf_id = buf_id;
599 rhaas@postgresql.org 2914 : 241920 : item->tsId = bufHdr->tag.spcOid;
2915 : 241920 : item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2916 : 241920 : item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2977 andres@anarazel.de 2917 : 241920 : item->blockNum = bufHdr->tag.blockNum;
2918 : : }
2919 : :
2926 2920 : 10697200 : UnlockBufHdr(bufHdr, buf_state);
2921 : :
2922 : : /* Check for barrier events in case NBuffers is large. */
1578 rhaas@postgresql.org 2923 [ - + ]: 10697200 : if (ProcSignalBarrierPending)
1578 rhaas@postgresql.org 2924 :UBC 0 : ProcessProcSignalBarrier();
2925 : : }
2926 : :
2977 andres@anarazel.de 2927 [ + + ]:CBC 1153 : if (num_to_scan == 0)
6135 tgl@sss.pgh.pa.us 2928 : 399 : return; /* nothing to do */
2929 : :
2977 andres@anarazel.de 2930 : 754 : WritebackContextInit(&wb_context, &checkpoint_flush_after);
2931 : :
2932 : : TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2933 : :
2934 : : /*
2935 : : * Sort buffers that need to be written to reduce the likelihood of random
2936 : : * IO. The sorting is also important for the implementation of balancing
2937 : : * writes between tablespaces. Without balancing writes we'd potentially
2938 : : * end up writing to the tablespaces one-by-one, possibly overloading the
2939 : : * underlying system.
2940 : : */
1129 tmunro@postgresql.or 2941 : 754 : sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2942 : :
2977 andres@anarazel.de 2943 : 754 : num_spaces = 0;
2944 : :
2945 : : /*
2946 : : * Allocate progress status for each tablespace with buffers that need to
2947 : : * be flushed. This requires the to-be-flushed array to be sorted.
2948 : : */
2949 : 754 : last_tsid = InvalidOid;
2950 [ + + ]: 242674 : for (i = 0; i < num_to_scan; i++)
2951 : : {
2952 : : CkptTsStatus *s;
2953 : : Oid cur_tsid;
2954 : :
2955 : 241920 : cur_tsid = CkptBufferIds[i].tsId;
2956 : :
2957 : : /*
2958 : : * Grow the array of per-tablespace status structs every time a new
2959 : : * tablespace is found.
2960 : : */
2961 [ + + + + ]: 241920 : if (last_tsid == InvalidOid || last_tsid != cur_tsid)
2962 : 1129 : {
2963 : : Size sz;
2964 : :
2965 : 1129 : num_spaces++;
2966 : :
2967 : : /*
2968 : : * Not worth adding grow-by-power-of-2 logic here - even with a
2969 : : * few hundred tablespaces this should be fine.
2970 : : */
2971 : 1129 : sz = sizeof(CkptTsStatus) * num_spaces;
2972 : :
2973 [ + + ]: 1129 : if (per_ts_stat == NULL)
2974 : 754 : per_ts_stat = (CkptTsStatus *) palloc(sz);
2975 : : else
2976 : 375 : per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
2977 : :
2978 : 1129 : s = &per_ts_stat[num_spaces - 1];
2979 : 1129 : memset(s, 0, sizeof(*s));
2980 : 1129 : s->tsId = cur_tsid;
2981 : :
2982 : : /*
2983 : : * The first buffer in this tablespace. As CkptBufferIds is sorted
2984 : : * by tablespace, all (s->num_to_scan) buffers in this tablespace
2985 : : * will follow afterwards.
2986 : : */
2987 : 1129 : s->index = i;
2988 : :
2989 : : /*
2990 : : * progress_slice will be determined once we know how many buffers
2991 : : * are in each tablespace, i.e. after this loop.
2992 : : */
2993 : :
2994 : 1129 : last_tsid = cur_tsid;
2995 : : }
2996 : : else
2997 : : {
2998 : 240791 : s = &per_ts_stat[num_spaces - 1];
2999 : : }
3000 : :
3001 : 241920 : s->num_to_scan++;
3002 : :
3003 : : /* Check for barrier events. */
1578 rhaas@postgresql.org 3004 [ - + ]: 241920 : if (ProcSignalBarrierPending)
1578 rhaas@postgresql.org 3005 :UBC 0 : ProcessProcSignalBarrier();
3006 : : }
3007 : :
2977 andres@anarazel.de 3008 [ - + ]:CBC 754 : Assert(num_spaces > 0);
3009 : :
3010 : : /*
3011 : : * Build a min-heap over the write-progress in the individual tablespaces,
3012 : : * and compute how large a portion of the total progress a single
3013 : : * processed buffer is.
3014 : : */
3015 : 754 : ts_heap = binaryheap_allocate(num_spaces,
3016 : : ts_ckpt_progress_comparator,
3017 : : NULL);
3018 : :
3019 [ + + ]: 1883 : for (i = 0; i < num_spaces; i++)
3020 : : {
3021 : 1129 : CkptTsStatus *ts_stat = &per_ts_stat[i];
3022 : :
3023 : 1129 : ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3024 : :
3025 : 1129 : binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3026 : : }
3027 : :
3028 : 754 : binaryheap_build(ts_heap);
3029 : :
3030 : : /*
3031 : : * Iterate through to-be-checkpointed buffers and write the ones (still)
3032 : : * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3033 : : * tablespaces; otherwise the sorting would lead to only one tablespace
3034 : : * receiving writes at a time, making inefficient use of the hardware.
3035 : : */
3036 : 754 : num_processed = 0;
6135 tgl@sss.pgh.pa.us 3037 : 754 : num_written = 0;
2977 andres@anarazel.de 3038 [ + + ]: 242556 : while (!binaryheap_empty(ts_heap))
3039 : : {
3040 : 241807 : BufferDesc *bufHdr = NULL;
3041 : : CkptTsStatus *ts_stat = (CkptTsStatus *)
331 tgl@sss.pgh.pa.us 3042 : 241807 : DatumGetPointer(binaryheap_first(ts_heap));
3043 : :
2977 andres@anarazel.de 3044 : 241807 : buf_id = CkptBufferIds[ts_stat->index].buf_id;
3045 [ - + ]: 241807 : Assert(buf_id != -1);
3046 : :
3047 : 241807 : bufHdr = GetBufferDescriptor(buf_id);
3048 : :
3049 : 241807 : num_processed++;
3050 : :
3051 : : /*
3052 : : * We don't need to acquire the lock here, because we're only looking
3053 : : * at a single bit. It's possible that someone else writes the buffer
3054 : : * and clears the flag right after we check, but that doesn't matter
3055 : : * since SyncOneBuffer will then do nothing. However, there is a
3056 : : * further race condition: it's conceivable that between the time we
3057 : : * examine the bit here and the time SyncOneBuffer acquires the lock,
3058 : : * someone else not only wrote the buffer but replaced it with another
3059 : : * page and dirtied it. In that improbable case, SyncOneBuffer will
3060 : : * write the buffer though we didn't need to. It doesn't seem worth
3061 : : * guarding against this, though.
3062 : : */
2926 3063 [ + + ]: 241807 : if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3064 : : {
2977 3065 [ + - ]: 239677 : if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3066 : : {
3067 : : TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
167 michael@paquier.xyz 3068 :GNC 239677 : PendingCheckpointerStats.buffers_written++;
6135 tgl@sss.pgh.pa.us 3069 :CBC 239677 : num_written++;
3070 : : }
3071 : : }
3072 : :
3073 : : /*
3074 : : * Measure progress independently of actually having to flush the buffer
3075 : : * - otherwise the writes would become unbalanced.
3076 : : */
2977 andres@anarazel.de 3077 : 241807 : ts_stat->progress += ts_stat->progress_slice;
3078 : 241807 : ts_stat->num_scanned++;
3079 : 241807 : ts_stat->index++;
3080 : :
3081 : : /* Have all the buffers from the tablespace been processed? */
3082 [ + + ]: 241807 : if (ts_stat->num_scanned == ts_stat->num_to_scan)
3083 : : {
3084 : 1125 : binaryheap_remove_first(ts_heap);
3085 : : }
3086 : : else
3087 : : {
3088 : : /* update heap with the new progress */
3089 : 240682 : binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3090 : : }
3091 : :
3092 : : /*
3093 : : * Sleep to throttle our I/O rate.
3094 : : *
3095 : : * (This will check for barrier events even if it doesn't sleep.)
3096 : : */
3097 : 241807 : CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3098 : : }
3099 : :
3100 : : /*
3101 : : * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3102 : : * IOContext will always be IOCONTEXT_NORMAL.
3103 : : */
333 3104 : 749 : IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3105 : :
2977 3106 : 749 : pfree(per_ts_stat);
3107 : 749 : per_ts_stat = NULL;
3108 : 749 : binaryheap_free(ts_heap);
3109 : :
3110 : : /*
3111 : : * Update checkpoint statistics. As noted above, this doesn't include
3112 : : * buffers written by other backends or bgwriter scan.
3113 : : */
6133 tgl@sss.pgh.pa.us 3114 : 749 : CheckpointStats.ckpt_bufs_written += num_written;
3115 : :
3116 : : TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3117 : : }
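
/*
 * Worked example of the balancing arithmetic above (illustrative numbers
 * only): suppose num_to_scan = 1000 checkpoint buffers total, 900 of them in
 * tablespace A and 100 in tablespace B.  Then
 *
 *     A.progress_slice = 1000 / 900 ~= 1.11
 *     B.progress_slice = 1000 / 100  = 10.0
 *
 * Each processed buffer advances its tablespace's progress by its slice, so
 * every tablespace reaches progress == num_to_scan exactly when its last
 * buffer is done.  Because the min-heap always yields the tablespace with
 * the least progress, the writes are interleaved roughly 9:1 in favor of A,
 * rather than A being drained completely before B receives any writes.
 */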
3118 : :
3119 : : /*
3120 : : * BgBufferSync -- Write out some dirty buffers in the pool.
3121 : : *
3122 : : * This is called periodically by the background writer process.
3123 : : *
3124 : : * Returns true if it's appropriate for the bgwriter process to go into
3125 : : * low-power hibernation mode. (This happens if the strategy clock sweep
3126 : : * has been "lapped" and no buffer allocations have occurred recently,
3127 : : * or if the bgwriter has been effectively disabled by setting
3128 : : * bgwriter_lru_maxpages to 0.)
3129 : : */
3130 : : bool
2977 andres@anarazel.de 3131 : 15299 : BgBufferSync(WritebackContext *wb_context)
3132 : : {
3133 : : /* info obtained from freelist.c */
3134 : : int strategy_buf_id;
3135 : : uint32 strategy_passes;
3136 : : uint32 recent_alloc;
3137 : :
3138 : : /*
3139 : : * Information saved between calls so we can determine the strategy
3140 : : * point's advance rate and avoid scanning already-cleaned buffers.
3141 : : */
3142 : : static bool saved_info_valid = false;
3143 : : static int prev_strategy_buf_id;
3144 : : static uint32 prev_strategy_passes;
3145 : : static int next_to_clean;
3146 : : static uint32 next_passes;
3147 : :
3148 : : /* Moving averages of allocation rate and clean-buffer density */
3149 : : static float smoothed_alloc = 0;
3150 : : static float smoothed_density = 10.0;
3151 : :
3152 : : /* Potentially these could be tunables, but for now, not */
6046 tgl@sss.pgh.pa.us 3153 : 15299 : float smoothing_samples = 16;
3154 : 15299 : float scan_whole_pool_milliseconds = 120000.0;
3155 : :
3156 : : /* Used to compute how far we scan ahead */
3157 : : long strategy_delta;
3158 : : int bufs_to_lap;
3159 : : int bufs_ahead;
3160 : : float scans_per_alloc;
3161 : : int reusable_buffers_est;
3162 : : int upcoming_alloc_est;
3163 : : int min_scan_buffers;
3164 : :
3165 : : /* Variables for the scanning loop proper */
3166 : : int num_to_scan;
3167 : : int num_written;
3168 : : int reusable_buffers;
3169 : :
3170 : : /* Variables for final smoothed_density update */
3171 : : long new_strategy_delta;
3172 : : uint32 new_recent_alloc;
3173 : :
3174 : : /*
3175 : : * Find out where the freelist clock sweep currently is, and how many
3176 : : * buffer allocations have happened since our last call.
3177 : : */
3178 : 15299 : strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3179 : :
3180 : : /* Report buffer alloc counts to pgstat */
739 andres@anarazel.de 3181 : 15299 : PendingBgWriterStats.buf_alloc += recent_alloc;
3182 : :
3183 : : /*
3184 : : * If we're not running the LRU scan, just stop after doing the stats
3185 : : * stuff. We mark the saved state invalid so that we can recover sanely
3186 : : * if LRU scan is turned back on later.
3187 : : */
6046 tgl@sss.pgh.pa.us 3188 [ + + ]: 15299 : if (bgwriter_lru_maxpages <= 0)
3189 : : {
6046 tgl@sss.pgh.pa.us 3190 :GBC 33 : saved_info_valid = false;
4462 heikki.linnakangas@i 3191 : 33 : return true;
3192 : : }
3193 : :
3194 : : /*
3195 : : * Compute strategy_delta = how many buffers have been scanned by the
3196 : : * clock sweep since last time. If first time through, assume none. Then
3197 : : * see if we are still ahead of the clock sweep, and if so, how many
3198 : : * buffers we could scan before we'd catch up with it and "lap" it. Note:
3199 : : * the weird-looking coding of the xxx_passes comparisons is to avoid bogus
3200 : : * behavior when the passes counts wrap around.
3201 : : */
6046 tgl@sss.pgh.pa.us 3202 [ + + ]:CBC 15266 : if (saved_info_valid)
3203 : : {
5995 bruce@momjian.us 3204 : 14536 : int32 passes_delta = strategy_passes - prev_strategy_passes;
3205 : :
6046 tgl@sss.pgh.pa.us 3206 : 14536 : strategy_delta = strategy_buf_id - prev_strategy_buf_id;
2489 3207 : 14536 : strategy_delta += (long) passes_delta * NBuffers;
3208 : :
6046 3209 [ - + ]: 14536 : Assert(strategy_delta >= 0);
3210 : :
3211 [ + + ]: 14536 : if ((int32) (next_passes - strategy_passes) > 0)
3212 : : {
3213 : : /* we're one pass ahead of the strategy point */
3214 : 3193 : bufs_to_lap = strategy_buf_id - next_to_clean;
3215 : : #ifdef BGW_DEBUG
3216 : : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3217 : : next_passes, next_to_clean,
3218 : : strategy_passes, strategy_buf_id,
3219 : : strategy_delta, bufs_to_lap);
3220 : : #endif
3221 : : }
3222 [ + + ]: 11343 : else if (next_passes == strategy_passes &&
3223 [ + + ]: 9018 : next_to_clean >= strategy_buf_id)
3224 : : {
3225 : : /* on same pass, but ahead or at least not behind */
3226 : 8757 : bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3227 : : #ifdef BGW_DEBUG
3228 : : elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3229 : : next_passes, next_to_clean,
3230 : : strategy_passes, strategy_buf_id,
3231 : : strategy_delta, bufs_to_lap);
3232 : : #endif
3233 : : }
3234 : : else
3235 : : {
3236 : : /*
3237 : : * We're behind, so skip forward to the strategy point and start
3238 : : * cleaning from there.
3239 : : */
3240 : : #ifdef BGW_DEBUG
3241 : : elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3242 : : next_passes, next_to_clean,
3243 : : strategy_passes, strategy_buf_id,
3244 : : strategy_delta);
3245 : : #endif
3246 : 2586 : next_to_clean = strategy_buf_id;
3247 : 2586 : next_passes = strategy_passes;
3248 : 2586 : bufs_to_lap = NBuffers;
3249 : : }
3250 : : }
3251 : : else
3252 : : {
3253 : : /*
3254 : : * Initializing at startup or after LRU scanning had been off. Always
3255 : : * start at the strategy point.
3256 : : */
3257 : : #ifdef BGW_DEBUG
3258 : : elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3259 : : strategy_passes, strategy_buf_id);
3260 : : #endif
3261 : 730 : strategy_delta = 0;
3262 : 730 : next_to_clean = strategy_buf_id;
3263 : 730 : next_passes = strategy_passes;
3264 : 730 : bufs_to_lap = NBuffers;
3265 : : }
3266 : :
3267 : : /* Update saved info for next time */
3268 : 15266 : prev_strategy_buf_id = strategy_buf_id;
3269 : 15266 : prev_strategy_passes = strategy_passes;
3270 : 15266 : saved_info_valid = true;
3271 : :
3272 : : /*
3273 : : * Compute how many buffers had to be scanned for each new allocation, ie,
3274 : : * 1/density of reusable buffers, and track a moving average of that.
3275 : : *
3276 : : * If the strategy point didn't move, we don't update the density estimate
3277 : : */
3278 [ + + + - ]: 15266 : if (strategy_delta > 0 && recent_alloc > 0)
3279 : : {
3280 : 3479 : scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3281 : 3479 : smoothed_density += (scans_per_alloc - smoothed_density) /
3282 : : smoothing_samples;
3283 : : }
3284 : :
3285 : : /*
3286 : : * Estimate how many reusable buffers there are between the current
3287 : : * strategy point and where we've scanned ahead to, based on the smoothed
3288 : : * density estimate.
3289 : : */
3290 : 15266 : bufs_ahead = NBuffers - bufs_to_lap;
3291 : 15266 : reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3292 : :
3293 : : /*
3294 : : * Track a moving average of recent buffer allocations. Here, rather than
3295 : : * a true average we want a fast-attack, slow-decline behavior: we
3296 : : * immediately follow any increase.
3297 : : */
3298 [ + + ]: 15266 : if (smoothed_alloc <= (float) recent_alloc)
3299 : 3500 : smoothed_alloc = recent_alloc;
3300 : : else
3301 : 11766 : smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3302 : : smoothing_samples;
3303 : :
3304 : : /* Scale the estimate by a GUC to allow more aggressive tuning. */
4530 3305 : 15266 : upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3306 : :
3307 : : /*
3308 : : * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3309 : : * eventually underflow to zero, and the underflows produce annoying
3310 : : * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3311 : : * zero, there's no point in tracking smaller and smaller values of
3312 : : * smoothed_alloc, so just reset it to exactly zero to avoid this
3313 : : * syndrome. It will pop back up as soon as recent_alloc increases.
3314 : : */
3315 [ + + ]: 15266 : if (upcoming_alloc_est == 0)
3316 : 1841 : smoothed_alloc = 0;
3317 : :
3318 : : /*
3319 : : * Even in cases where there's been little or no buffer allocation
3320 : : * activity, we want to make a small amount of progress through the buffer
3321 : : * cache so that as many reusable buffers as possible are clean after an
3322 : : * idle period.
3323 : : *
3324 : : * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3325 : : * the BGW will be called during the scan_whole_pool time; slice the
3326 : : * buffer pool into that many sections.
3327 : : */
6046 3328 : 15266 : min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3329 : :
3330 [ + + ]: 15266 : if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3331 : : {
3332 : : #ifdef BGW_DEBUG
3333 : : elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3334 : : upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3335 : : #endif
3336 : 7838 : upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3337 : : }
3338 : :
3339 : : /*
3340 : : * Now write out dirty reusable buffers, working forward from the
3341 : : * next_to_clean point, until we have lapped the strategy scan, or cleaned
3342 : : * enough buffers to match our estimate of the next cycle's allocation
3343 : : * requirements, or hit the bgwriter_lru_maxpages limit.
3344 : : */
3345 : :
3346 : 15266 : num_to_scan = bufs_to_lap;
3347 : 15266 : num_written = 0;
3348 : 15266 : reusable_buffers = reusable_buffers_est;
3349 : :
3350 : : /* Execute the LRU scan */
3351 [ + + + + ]: 1679728 : while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3352 : : {
2926 andres@anarazel.de 3353 : 1664473 : int sync_state = SyncOneBuffer(next_to_clean, true,
3354 : : wb_context);
3355 : :
6046 tgl@sss.pgh.pa.us 3356 [ + + ]: 1664473 : if (++next_to_clean >= NBuffers)
3357 : : {
3358 : 3065 : next_to_clean = 0;
3359 : 3065 : next_passes++;
3360 : : }
3361 : 1664473 : num_to_scan--;
3362 : :
2926 andres@anarazel.de 3363 [ + + ]: 1664473 : if (sync_state & BUF_WRITTEN)
3364 : : {
6046 tgl@sss.pgh.pa.us 3365 : 34633 : reusable_buffers++;
3366 [ + + ]: 34633 : if (++num_written >= bgwriter_lru_maxpages)
3367 : : {
739 andres@anarazel.de 3368 : 11 : PendingBgWriterStats.maxwritten_clean++;
6046 tgl@sss.pgh.pa.us 3369 : 11 : break;
3370 : : }
3371 : : }
2926 andres@anarazel.de 3372 [ + + ]: 1629840 : else if (sync_state & BUF_REUSABLE)
6046 tgl@sss.pgh.pa.us 3373 : 1177703 : reusable_buffers++;
3374 : : }
3375 : :
739 andres@anarazel.de 3376 : 15266 : PendingBgWriterStats.buf_written_clean += num_written;
3377 : :
3378 : : #ifdef BGW_DEBUG
3379 : : elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3380 : : recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3381 : : smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3382 : : bufs_to_lap - num_to_scan,
3383 : : num_written,
3384 : : reusable_buffers - reusable_buffers_est);
3385 : : #endif
3386 : :
3387 : : /*
3388 : : * Consider the above scan as being like a new allocation scan.
3389 : : * Characterize its density and update the smoothed one based on it. This
3390 : : * effectively halves the moving average period in cases where both the
3391 : : * strategy and the background writer are doing some useful scanning,
3392 : : * which is helpful because a long memory isn't desirable for the
3393 : : * density estimates.
3394 : : */
4358 tgl@sss.pgh.pa.us 3395 : 15266 : new_strategy_delta = bufs_to_lap - num_to_scan;
3396 : 15266 : new_recent_alloc = reusable_buffers - reusable_buffers_est;
3397 [ + + + + ]: 15266 : if (new_strategy_delta > 0 && new_recent_alloc > 0)
3398 : : {
3399 : 11156 : scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
6046 3400 : 11156 : smoothed_density += (scans_per_alloc - smoothed_density) /
3401 : : smoothing_samples;
3402 : :
3403 : : #ifdef BGW_DEBUG
3404 : : elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3405 : : new_recent_alloc, new_strategy_delta,
3406 : : scans_per_alloc, smoothed_density);
3407 : : #endif
3408 : : }
3409 : :
3410 : : /* Return true if OK to hibernate */
4358 3411 [ + + + + ]: 15266 : return (bufs_to_lap == 0 && recent_alloc == 0);
3412 : : }
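
/*
 * The smoothing used above, written out as a standalone update (illustrative
 * only): with smoothing_samples = 16, each step is an exponential moving
 * average,
 *
 *     smoothed += (sample - smoothed) / 16;
 *     which is equivalent to  smoothed = (15 * smoothed + sample) / 16;
 *
 * so a single outlying sample shifts the estimate by only 1/16 of its
 * deviation, and the influence of old samples decays geometrically.  The
 * allocation average is additionally fast-attack: it jumps straight to any
 * larger recent_alloc and only declines through this formula.
 */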
3413 : :
3414 : : /*
3415 : : * SyncOneBuffer -- process a single buffer during syncing.
3416 : : *
3417 : : * If skip_recently_used is true, we don't write currently-pinned buffers, nor
3418 : : * buffers marked recently used, as these are not replacement candidates.
3419 : : *
3420 : : * Returns a bitmask containing the following flag bits:
3421 : : * BUF_WRITTEN: we wrote the buffer.
3422 : : * BUF_REUSABLE: buffer is available for replacement, ie, it has
3423 : : * pin count 0 and usage count 0.
3424 : : *
3425 : : * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
3426 : : * after locking it, but we don't care all that much.)
3427 : : */
3428 : : static int
2977 andres@anarazel.de 3429 : 1904150 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
3430 : : {
3072 rhaas@postgresql.org 3431 : 1904150 : BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
5995 bruce@momjian.us 3432 : 1904150 : int result = 0;
3433 : : uint32 buf_state;
3434 : : BufferTag tag;
3435 : :
3436 : : /* Make sure we can handle the pin */
3373 andres@anarazel.de 3437 : 1904150 : ReservePrivateRefCountEntry();
158 heikki.linnakangas@i 3438 :GNC 1904150 : ResourceOwnerEnlarge(CurrentResourceOwner);
3439 : :
3440 : : /*
3441 : : * Check whether buffer needs writing.
3442 : : *
3443 : : * We can make this check without taking the buffer content lock so long
3444 : : * as we mark pages dirty in access methods *before* logging changes with
3445 : : * XLogInsert(): if someone marks the buffer dirty just after our check, we
3446 : : * needn't worry, because our checkpoint.redo points before the log record for
3447 : : * the upcoming changes, so we are not required to write such a dirty buffer.
3448 : : */
2926 andres@anarazel.de 3449 :CBC 1904150 : buf_state = LockBufHdr(bufHdr);
3450 : :
3451 [ + + ]: 1904150 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3452 [ + + ]: 1901537 : BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3453 : : {
6046 tgl@sss.pgh.pa.us 3454 : 1213011 : result |= BUF_REUSABLE;
3455 : : }
3456 [ + + ]: 691139 : else if (skip_recently_used)
3457 : : {
3458 : : /* Caller told us not to write recently-used buffers */
2926 andres@anarazel.de 3459 : 452137 : UnlockBufHdr(bufHdr, buf_state);
6046 tgl@sss.pgh.pa.us 3460 : 452137 : return result;
3461 : : }
3462 : :
2926 andres@anarazel.de 3463 [ + + + + ]: 1452013 : if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3464 : : {
3465 : : /* It's clean, so nothing to do */
3466 : 1177703 : UnlockBufHdr(bufHdr, buf_state);
6046 tgl@sss.pgh.pa.us 3467 : 1177703 : return result;
3468 : : }
3469 : :
3470 : : /*
3471 : : * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3472 : : * buffer is clean by the time we've locked it.)
3473 : : */
6981 3474 : 274310 : PinBuffer_Locked(bufHdr);
3043 rhaas@postgresql.org 3475 : 274310 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3476 : :
430 andres@anarazel.de 3477 : 274310 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3478 : :
3043 rhaas@postgresql.org 3479 : 274310 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3480 : :
2977 andres@anarazel.de 3481 : 274310 : tag = bufHdr->tag;
3482 : :
562 michael@paquier.xyz 3483 : 274310 : UnpinBuffer(bufHdr);
3484 : :
3485 : : /*
3486 : : * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3487 : : * IOContext will always be IOCONTEXT_NORMAL.
3488 : : */
333 andres@anarazel.de 3489 : 274310 : ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
3490 : :
6046 tgl@sss.pgh.pa.us 3491 : 274310 : return result | BUF_WRITTEN;
3492 : : }
3493 : :
3494 : : /*
3495 : : * AtEOXact_Buffers - clean up at end of transaction.
3496 : : *
3497 : : * As of PostgreSQL 8.0, buffer pins should get released by the
3498 : : * ResourceOwner mechanism. This routine is just a debugging
3499 : : * cross-check that no pins remain.
3500 : : */
3501 : : void
7922 3502 : 432909 : AtEOXact_Buffers(bool isCommit)
3503 : : {
3586 andres@anarazel.de 3504 : 432909 : CheckForBufferLeaks();
3505 : :
7120 tgl@sss.pgh.pa.us 3506 : 432909 : AtEOXact_LocalBuffers(isCommit);
3507 : :
3515 andres@anarazel.de 3508 [ - + ]: 432909 : Assert(PrivateRefCountOverflowed == 0);
3509 : 432909 : }
3510 : :
3511 : : /*
3512 : : * Initialize access to shared buffer pool
3513 : : *
3514 : : * This is called during backend startup (whether standalone or under the
3515 : : * postmaster). It sets up for this backend's access to the already-existing
3516 : : * buffer pool.
3517 : : */
3518 : : void
3519 : 19575 : InitBufferPoolAccess(void)
3520 : : {
3521 : : HASHCTL hash_ctl;
3522 : :
3523 : 19575 : memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3524 : :
3525 : 19575 : hash_ctl.keysize = sizeof(int32);
2975 3526 : 19575 : hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3527 : :
3515 3528 : 19575 : PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3529 : : HASH_ELEM | HASH_BLOBS);
3530 : :
3531 : : /*
3532 : : * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3533 : : * the corresponding phase of backend shutdown.
3534 : : */
983 3535 [ - + ]: 19575 : Assert(MyProc != NULL);
6824 tgl@sss.pgh.pa.us 3536 : 19575 : on_shmem_exit(AtProcExit_Buffers, 0);
3537 : 19575 : }
3538 : :
3539 : : /*
3540 : : * During backend exit, ensure that we released all shared-buffer locks and
3541 : : * assert that we have no remaining pins.
3542 : : */
3543 : : static void
3544 : 18045 : AtProcExit_Buffers(int code, Datum arg)
3545 : : {
7120 3546 : 18045 : UnlockBuffers();
3547 : :
3586 andres@anarazel.de 3548 : 18045 : CheckForBufferLeaks();
3549 : :
3550 : : /* localbuf.c needs a chance too */
3551 : 18045 : AtProcExit_LocalBuffers();
3552 : 18045 : }
3553 : :
3554 : : /*
3555 : : * CheckForBufferLeaks - ensure this backend holds no buffer pins
3556 : : *
3557 : : * As of PostgreSQL 8.0, buffer pins should get released by the
3558 : : * ResourceOwner mechanism. This routine is just a debugging
3559 : : * cross-check that no pins remain.
3560 : : */
3561 : : static void
3562 : 450954 : CheckForBufferLeaks(void)
3563 : : {
3564 : : #ifdef USE_ASSERT_CHECKING
3565 : 450954 : int RefCountErrors = 0;
3566 : : PrivateRefCountEntry *res;
3567 : : int i;
3568 : : char *s;
3569 : :
3570 : : /* check the array */
3515 3571 [ + + ]: 4058586 : for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3572 : : {
3573 : 3607632 : res = &PrivateRefCountArray[i];
3574 : :
3575 [ - + ]: 3607632 : if (res->buffer != InvalidBuffer)
3576 : : {
158 heikki.linnakangas@i 3577 :UNC 0 : s = DebugPrintBufferRefcount(res->buffer);
3578 [ # # ]: 0 : elog(WARNING, "buffer refcount leak: %s", s);
3579 : 0 : pfree(s);
3580 : :
3515 andres@anarazel.de 3581 :UBC 0 : RefCountErrors++;
3582 : : }
3583 : : }
3584 : :
3585 : : /* if necessary search the hash */
3515 andres@anarazel.de 3586 [ - + ]:CBC 450954 : if (PrivateRefCountOverflowed)
3587 : : {
3588 : : HASH_SEQ_STATUS hstat;
3589 : :
3515 andres@anarazel.de 3590 :UBC 0 : hash_seq_init(&hstat, PrivateRefCountHash);
3591 [ # # ]: 0 : while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3592 : : {
158 heikki.linnakangas@i 3593 :UNC 0 : s = DebugPrintBufferRefcount(res->buffer);
3594 [ # # ]: 0 : elog(WARNING, "buffer refcount leak: %s", s);
3595 : 0 : pfree(s);
3586 andres@anarazel.de 3596 :UBC 0 : RefCountErrors++;
3597 : : }
3598 : : }
3599 : :
3586 andres@anarazel.de 3600 [ - + ]:CBC 450954 : Assert(RefCountErrors == 0);
3601 : : #endif
7120 tgl@sss.pgh.pa.us 3602 : 450954 : }
3603 : :
3604 : : /*
3605 : : * Helper routine to issue warnings when a buffer is unexpectedly pinned
3606 : : */
3607 : : char *
158 heikki.linnakangas@i 3608 :UNC 0 : DebugPrintBufferRefcount(Buffer buffer)
3609 : : {
3610 : : BufferDesc *buf;
3611 : : int32 loccount;
3612 : : char *path;
3613 : : char *result;
3614 : : ProcNumber backend;
3615 : : uint32 buf_state;
3616 : :
7120 tgl@sss.pgh.pa.us 3617 [ # # ]:UBC 0 : Assert(BufferIsValid(buffer));
3618 [ # # ]: 0 : if (BufferIsLocal(buffer))
3619 : : {
3363 andres@anarazel.de 3620 : 0 : buf = GetLocalBufferDescriptor(-buffer - 1);
7120 tgl@sss.pgh.pa.us 3621 : 0 : loccount = LocalRefCount[-buffer - 1];
42 heikki.linnakangas@i 3622 :UNC 0 : backend = MyProcNumber;
3623 : : }
3624 : : else
3625 : : {
3363 andres@anarazel.de 3626 :UBC 0 : buf = GetBufferDescriptor(buffer - 1);
3515 3627 : 0 : loccount = GetPrivateRefCount(buffer);
42 heikki.linnakangas@i 3628 :UNC 0 : backend = INVALID_PROC_NUMBER;
3629 : : }
3630 : :
3631 : : /* theoretically we should lock the bufhdr here */
599 rhaas@postgresql.org 3632 :UBC 0 : path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3633 : : BufTagGetForkNum(&buf->tag));
2926 andres@anarazel.de 3634 : 0 : buf_state = pg_atomic_read_u32(&buf->state);
3635 : :
158 heikki.linnakangas@i 3636 :UNC 0 : result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3637 : : buffer, path,
3638 : : buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3639 : : BUF_STATE_GET_REFCOUNT(buf_state), loccount);
5633 heikki.linnakangas@i 3640 :UBC 0 : pfree(path);
158 heikki.linnakangas@i 3641 :UNC 0 : return result;
7227 tgl@sss.pgh.pa.us 3642 :EUB : }
3643 : :
3644 : : /*
3645 : : * CheckPointBuffers
3646 : : *
3647 : : * Flush all dirty blocks in buffer pool to disk at checkpoint time.
3648 : : *
3649 : : * Note: temporary relations do not participate in checkpoints, so they don't
3650 : : * need to be flushed.
3651 : : */
3652 : : void
6135 tgl@sss.pgh.pa.us 3653 :CBC 1153 : CheckPointBuffers(int flags)
3654 : : {
3655 : 1153 : BufferSync(flags);
8536 vadim4o@yahoo.com 3656 : 1148 : }
3657 : :
3658 : : /*
3659 : : * BufferGetBlockNumber
3660 : : * Returns the block number associated with a buffer.
3661 : : *
3662 : : * Note:
3663 : : * Assumes that the buffer is valid and pinned, else the
3664 : : * value may be obsolete immediately...
3665 : : */
3666 : : BlockNumber
10141 scrappy@hub.org 3667 : 80255914 : BufferGetBlockNumber(Buffer buffer)
3668 : : {
3669 : : BufferDesc *bufHdr;
3670 : :
8035 bruce@momjian.us 3671 [ - + + + : 80255914 : Assert(BufferIsPinned(buffer));
- + ]
3672 : :
9716 3673 [ + + ]: 80255914 : if (BufferIsLocal(buffer))
3363 andres@anarazel.de 3674 : 2359292 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3675 : : else
3676 : 77896622 : bufHdr = GetBufferDescriptor(buffer - 1);
3677 : :
3678 : : /* pinned, so OK to read tag without spinlock */
6981 tgl@sss.pgh.pa.us 3679 : 80255914 : return bufHdr->tag.blockNum;
3680 : : }
3681 : :
3682 : : /*
3683 : : * BufferGetTag
3684 : : * Returns the relfilelocator, fork number and block number associated with
3685 : : * a buffer.
3686 : : */
3687 : : void
648 rhaas@postgresql.org 3688 : 13945493 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
3689 : : BlockNumber *blknum)
3690 : : {
3691 : : BufferDesc *bufHdr;
3692 : :
3693 : : /* Do the same checks as BufferGetBlockNumber. */
5725 heikki.linnakangas@i 3694 [ - + - + : 13945493 : Assert(BufferIsPinned(buffer));
- + ]
3695 : :
7300 tgl@sss.pgh.pa.us 3696 [ - + ]: 13945493 : if (BufferIsLocal(buffer))
3363 andres@anarazel.de 3697 :UBC 0 : bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3698 : : else
3363 andres@anarazel.de 3699 :CBC 13945493 : bufHdr = GetBufferDescriptor(buffer - 1);
3700 : :
3701 : : /* pinned, so OK to read tag without spinlock */
599 rhaas@postgresql.org 3702 : 13945493 : *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3703 : 13945493 : *forknum = BufTagGetForkNum(&bufHdr->tag);
5725 heikki.linnakangas@i 3704 : 13945493 : *blknum = bufHdr->tag.blockNum;
7300 tgl@sss.pgh.pa.us 3705 : 13945493 : }
3706 : :
3707 : : /*
3708 : : * FlushBuffer
3709 : : * Physically write out a shared buffer.
3710 : : *
3711 : : * NOTE: this actually just passes the buffer contents to the kernel; the
3712 : : * real write to disk won't happen until the kernel feels like it. This
3713 : : * is okay from our point of view since we can redo the changes from WAL.
3714 : : * However, we will need to force the changes to disk via fsync before
3715 : : * we can checkpoint WAL.
3716 : : *
3717 : : * The caller must hold a pin on the buffer and have share-locked the
3718 : : * buffer contents. (Note: a share-lock does not prevent updates of
3719 : : * hint bits in the buffer, so the page could change while the write
3720 : : * is in progress, but we assume that that will not invalidate the data
3721 : : * written.)
3722 : : *
3723 : : * If the caller has an smgr reference for the buffer's relation, pass it
3724 : : * as the second parameter. If not, pass NULL.
3725 : : */
3726 : : static void
430 andres@anarazel.de 3727 : 530338 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
3728 : : IOContext io_context)
3729 : : {
3730 : : XLogRecPtr recptr;
3731 : : ErrorContextCallback errcallback;
3732 : : instr_time io_start;
3733 : : Block bufBlock;
3734 : : char *bufToWrite;
3735 : : uint32 buf_state;
3736 : :
3737 : : /*
3738 : : * Try to start an I/O operation. If StartBufferIO returns false, then
3739 : : * someone else flushed the buffer before we could, so we need not do
3740 : : * anything.
3741 : : */
11 tmunro@postgresql.or 3742 [ - + ]:GNC 530338 : if (!StartBufferIO(buf, false, false))
6981 tgl@sss.pgh.pa.us 3743 :UBC 0 : return;
3744 : :
3745 : : /* Setup error traceback support for ereport() */
4171 heikki.linnakangas@i 3746 :CBC 530338 : errcallback.callback = shared_buffer_write_error_callback;
3747 : 530338 : errcallback.arg = (void *) buf;
3748 : 530338 : errcallback.previous = error_context_stack;
3749 : 530338 : error_context_stack = &errcallback;
3750 : :
3751 : : /* Find smgr relation for buffer */
7298 tgl@sss.pgh.pa.us 3752 [ + + ]: 530338 : if (reln == NULL)
42 heikki.linnakangas@i 3753 :GNC 526808 : reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
3754 : :
3755 : : TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3756 : : buf->tag.blockNum,
3757 : : reln->smgr_rlocator.locator.spcOid,
3758 : : reln->smgr_rlocator.locator.dbOid,
3759 : : reln->smgr_rlocator.locator.relNumber);
3760 : :
2926 andres@anarazel.de 3761 :CBC 530338 : buf_state = LockBufHdr(buf);
3762 : :
3763 : : /*
3764 : : * Run PageGetLSN while holding header lock, since we don't have the
3765 : : * buffer locked exclusively in all cases.
3766 : : */
4041 simon@2ndQuadrant.co 3767 : 530338 : recptr = BufferGetLSN(buf);
3768 : :
3769 : : /* To check if block content changes while flushing. - vadim 01/17/97 */
2926 andres@anarazel.de 3770 : 530338 : buf_state &= ~BM_JUST_DIRTIED;
3771 : 530338 : UnlockBufHdr(buf, buf_state);
3772 : :
3773 : : /*
3774 : : * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3775 : : * rule that log updates must hit disk before any of the data-file changes
3776 : : * they describe do.
3777 : : *
3778 : : * However, this rule does not apply to unlogged relations, which will be
3779 : : * lost after a crash anyway. Most unlogged relation pages do not bear
3780 : : * LSNs since we never emit WAL records for them, and therefore flushing
3781 : : * up through the buffer LSN would be useless, but harmless. However,
3782 : : * GiST indexes use LSNs internally to track page-splits, and therefore
3783 : : * unlogged GiST pages bear "fake" LSNs generated by
3784 : : * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3785 : : * LSN counter could advance past the WAL insertion point; and if it did
3786 : : * happen, attempting to flush WAL through that location would fail, with
3787 : : * disastrous system-wide consequences. To make sure that can't happen,
3788 : : * skip the flush if the buffer isn't permanent.
3789 : : */
3790 [ + + ]: 530338 : if (buf_state & BM_PERMANENT)
4080 heikki.linnakangas@i 3791 : 528285 : XLogFlush(recptr);
3792 : :
3793 : : /*
3794 : : * Now it's safe to write buffer to disk. Note that no one else should
3795 : : * have been able to write it while we were busy with log flushing because
3796 : : * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3797 : : */
4041 simon@2ndQuadrant.co 3798 : 530338 : bufBlock = BufHdrGetBlock(buf);
3799 : :
3800 : : /*
3801 : : * Update page checksum if desired. Since we have only shared lock on the
3802 : : * buffer, other processes might be updating hint bits in it, so we must
3803 : : * copy the page to private storage if we do checksumming.
3804 : : */
3805 : 530338 : bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3806 : :
120 michael@paquier.xyz 3807 :GNC 530338 : io_start = pgstat_prepare_io_time(track_io_timing);
3808 : :
3809 : : /*
3810 : : * bufToWrite is either the shared buffer or a copy, as appropriate.
3811 : : */
7369 tgl@sss.pgh.pa.us 3812 :CBC 530338 : smgrwrite(reln,
599 rhaas@postgresql.org 3813 : 530338 : BufTagGetForkNum(&buf->tag),
3814 : : buf->tag.blockNum,
3815 : : bufToWrite,
3816 : : false);
3817 : :
3818 : : /*
3819 : : * When a strategy is in use, only flushes of dirty buffers already in the
3820 : : * strategy ring are counted as strategy writes (IOCONTEXT
3821 : : * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3822 : : * statistics tracking.
3823 : : *
3824 : : * If a shared buffer initially added to the ring must be flushed before
3825 : : * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3826 : : *
3827 : : * If a shared buffer which was added to the ring later because the
3828 : : * current strategy buffer is pinned or in use or because all strategy
3829 : : * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3830 : : * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3831 : : * (from_ring will be false).
3832 : : *
3833 : : * When a strategy is not in use, the write can only be a "regular" write
3834 : : * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3835 : : */
373 andres@anarazel.de 3836 : 530338 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
3837 : : IOOP_WRITE, io_start, 1);
3838 : :
5234 rhaas@postgresql.org 3839 : 530338 : pgBufferUsage.shared_blks_written++;
3840 : :
3841 : : /*
3842 : : * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3843 : : * end the BM_IO_IN_PROGRESS state.
3844 : : */
158 heikki.linnakangas@i 3845 :GNC 530338 : TerminateBufferIO(buf, true, 0, true);
3846 : :
3847 : : TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3848 : : buf->tag.blockNum,
3849 : : reln->smgr_rlocator.locator.spcOid,
3850 : : reln->smgr_rlocator.locator.dbOid,
3851 : : reln->smgr_rlocator.locator.relNumber);
3852 : :
3853 : : /* Pop the error context stack */
4171 heikki.linnakangas@i 3854 :CBC 530338 : error_context_stack = errcallback.previous;
3855 : : }
3856 : :
3857 : : /*
3858 : : * RelationGetNumberOfBlocksInFork
3859 : : * Determines the current number of pages in the specified relation fork.
3860 : : *
3861 : : * Note that the accuracy of the result will depend on the details of the
3862 : : * relation's storage. For builtin AMs it'll be accurate, but for external AMs
3863 : : * it might not be.
3864 : : */
3865 : : BlockNumber
4855 rhaas@postgresql.org 3866 : 1537475 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
3867 : : {
863 peter@eisentraut.org 3868 [ + + + + : 1537475 : if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
+ + ]
3869 : : {
3870 : : /*
3871 : : * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
3872 : : * tableam returns the size in bytes - but for the purpose of this
3873 : : * routine, we want the number of blocks. Therefore divide, rounding
3874 : : * up.
3875 : : */
3876 : : uint64 szbytes;
3877 : :
3878 : 1161522 : szbytes = table_relation_size(relation, forkNum);
3879 : :
3880 : 1161503 : return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
3881 : : }
3882 [ + - + + : 375953 : else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
- + - - -
- ]
3883 : : {
703 tgl@sss.pgh.pa.us 3884 : 375953 : return smgrnblocks(RelationGetSmgr(relation), forkNum);
3885 : : }
3886 : : else
863 peter@eisentraut.org 3887 :UBC 0 : Assert(false);
3888 : :
3889 : : return 0; /* keep compiler quiet */
3890 : : }
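
/*
 * The round-up division above, with illustrative numbers: a table AM
 * reporting szbytes = 8193 with BLCKSZ = 8192 yields
 * (8193 + 8191) / 8192 = 2, i.e. a partial trailing block is counted as a
 * whole block.
 */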
3891 : :
3892 : : /*
3893 : : * BufferIsPermanent
3894 : : * Determines whether a buffer will potentially still be around after
3895 : : * a crash. Caller must hold a buffer pin.
3896 : : */
3897 : : bool
4552 rhaas@postgresql.org 3898 :CBC 8944186 : BufferIsPermanent(Buffer buffer)
3899 : : {
3900 : : BufferDesc *bufHdr;
3901 : :
3902 : : /* Local buffers are used only for temp relations. */
3903 [ + + ]: 8944186 : if (BufferIsLocal(buffer))
3904 : 672546 : return false;
3905 : :
3906 : : /* Make sure we've got a real buffer, and that we hold a pin on it. */
3907 [ - + ]: 8271640 : Assert(BufferIsValid(buffer));
3908 [ - + - + : 8271640 : Assert(BufferIsPinned(buffer));
- + ]
3909 : :
3910 : : /*
3911 : : * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3912 : : * need not bother with the buffer header spinlock. Even if someone else
3913 : : * changes the buffer header state while we're doing this, the state is
3914 : : * changed atomically, so we'll read the old value or the new value, but
3915 : : * not random garbage.
3916 : : */
3363 andres@anarazel.de 3917 : 8271640 : bufHdr = GetBufferDescriptor(buffer - 1);
2926 3918 : 8271640 : return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3919 : : }
3920 : :
3921 : : /*
3922 : : * BufferGetLSNAtomic
3923 : : * Retrieves the LSN of the buffer atomically using a buffer header lock.
3924 : : * This is necessary for some callers who may not have an exclusive lock
3925 : : * on the buffer.
3926 : : */
3927 : : XLogRecPtr
4041 simon@2ndQuadrant.co 3928 : 7592142 : BufferGetLSNAtomic(Buffer buffer)
3929 : : {
3072 rhaas@postgresql.org 3930 : 7592142 : BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
2916 kgrittn@postgresql.o 3931 : 7592142 : char *page = BufferGetPage(buffer);
3932 : : XLogRecPtr lsn;
3933 : : uint32 buf_state;
3934 : :
3935 : : /*
3936 : : * If we don't need locking for correctness, fastpath out.
3937 : : */
3215 heikki.linnakangas@i 3938 [ + + + + : 7592142 : if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
+ + ]
4041 simon@2ndQuadrant.co 3939 : 5889490 : return PageGetLSN(page);
3940 : :
3941 : : /* Make sure we've got a real buffer, and that we hold a pin on it. */
3942 [ - + ]: 1702652 : Assert(BufferIsValid(buffer));
3943 [ - + - + : 1702652 : Assert(BufferIsPinned(buffer));
- + ]
3944 : :
2926 andres@anarazel.de 3945 : 1702652 : buf_state = LockBufHdr(bufHdr);
4041 simon@2ndQuadrant.co 3946 : 1702652 : lsn = PageGetLSN(page);
2926 andres@anarazel.de 3947 : 1702652 : UnlockBufHdr(bufHdr, buf_state);
3948 : :
4041 simon@2ndQuadrant.co 3949 : 1702652 : return lsn;
3950 : : }
3951 : :
3952 : : /* ---------------------------------------------------------------------
3953 : : * DropRelationBuffers
3954 : : *
3955 : : * This function removes from the buffer pool all the pages of the
3956 : : * specified relation forks that have block numbers >= firstDelBlock.
3957 : : * (In particular, with firstDelBlock = 0, all pages are removed.)
3958 : : * Dirty pages are simply dropped, without bothering to write them
3959 : : * out first. Therefore, this is NOT rollback-able, and so should be
3960 : : * used only with extreme caution!
3961 : : *
3962 : : * Currently, this is called only from smgr.c when the underlying file
3963 : : * is about to be deleted or truncated (firstDelBlock is needed for
3964 : : * the truncation case). The data in the affected pages would therefore
3965 : : * be deleted momentarily anyway, and there is no point in writing it.
3966 : : * It is the responsibility of higher-level code to ensure that the
3967 : : * deletion or truncation does not lose any data that could be needed
3968 : : * later. It is also the responsibility of higher-level code to ensure
3969 : : * that no other process could be trying to load more pages of the
3970 : : * relation into buffers.
3971 : : * --------------------------------------------------------------------
3972 : : */
3973 : : void
642 rhaas@postgresql.org 3974 : 578 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
3975 : : int nforks, BlockNumber *firstDelBlock)
3976 : : {
3977 : : int i;
3978 : : int j;
3979 : : RelFileLocatorBackend rlocator;
3980 : : BlockNumber nForkBlock[MAX_FORKNUM];
1068 tgl@sss.pgh.pa.us 3981 : 578 : uint64 nBlocksToInvalidate = 0;
3982 : :
648 rhaas@postgresql.org 3983 : 578 : rlocator = smgr_reln->smgr_rlocator;
3984 : :
3985 : : /* If it's a local relation, it's localbuf.c's problem. */
3986 [ + + ]: 578 : if (RelFileLocatorBackendIsTemp(rlocator))
3987 : : {
42 heikki.linnakangas@i 3988 [ + - ]:GNC 329 : if (rlocator.backend == MyProcNumber)
3989 : : {
1664 fujii@postgresql.org 3990 [ + + ]:CBC 675 : for (j = 0; j < nforks; j++)
642 rhaas@postgresql.org 3991 : 346 : DropRelationLocalBuffers(rlocator.locator, forkNum[j],
3992 : 346 : firstDelBlock[j]);
3993 : : }
7922 tgl@sss.pgh.pa.us 3994 : 371 : return;
3995 : : }
3996 : :
3997 : : /*
3998 : : * To remove all the pages of the specified relation forks from the buffer
3999 : : * pool, we need to scan the entire buffer pool but we can optimize it by
4000 : : * finding the buffers from the BufMapping table, provided we know the exact
4001 : : * size of each fork of the relation. The exact size is required to ensure
4002 : : * that we don't leave any buffer for the relation being dropped as
4003 : : * otherwise the background writer or checkpointer can lead to a PANIC
4004 : : * error while flushing buffers corresponding to files that don't exist.
4005 : : *
4006 : : * To know the exact size, we rely on the size cached for each fork by us
4007 : : * during recovery which limits the optimization to recovery and on
4008 : : * standbys but we can easily extend it once we have shared cache for
4009 : : * relation size.
4010 : : *
4011 : : * In recovery, we cache the value returned by the first lseek(SEEK_END)
4012 : : * and future writes keep the cached value up-to-date. See
4013 : : * smgrextend. It is possible that the value of the first lseek is smaller
4014 : : * than the actual number of existing blocks in the file due to buggy
4015 : : * Linux kernels that might not have accounted for the recent write. But
4016 : : * that should be fine because there must not be any buffers after that
4017 : : * file size.
4018 : : */
1188 akapila@postgresql.o 4019 [ + + ]: 347 : for (i = 0; i < nforks; i++)
4020 : : {
4021 : : /* Get the number of blocks for a relation's fork */
4022 : 297 : nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4023 : :
4024 [ + + ]: 297 : if (nForkBlock[i] == InvalidBlockNumber)
4025 : : {
4026 : 199 : nBlocksToInvalidate = InvalidBlockNumber;
4027 : 199 : break;
4028 : : }
4029 : :
4030 : : /* calculate the number of blocks to be invalidated */
4031 : 98 : nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4032 : : }
4033 : :
4034 : : /*
4035 : : * We apply the optimization iff the total number of blocks to invalidate
4036 : : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4037 : : */
4038 [ + + ]: 249 : if (BlockNumberIsValid(nBlocksToInvalidate) &&
4039 [ + + ]: 50 : nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4040 : : {
4041 [ + + ]: 121 : for (j = 0; j < nforks; j++)
642 rhaas@postgresql.org 4042 : 79 : FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4043 : 79 : nForkBlock[j], firstDelBlock[j]);
1188 akapila@postgresql.o 4044 : 42 : return;
4045 : : }
4046 : :
6981 tgl@sss.pgh.pa.us 4047 [ + + ]: 2692687 : for (i = 0; i < NBuffers; i++)
4048 : : {
3072 rhaas@postgresql.org 4049 : 2692480 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4050 : : uint32 buf_state;
4051 : :
4052 : : /*
4053 : : * We can make this a tad faster by prechecking the buffer tag before
4054 : : * we attempt to lock the buffer; this saves a lot of lock
4055 : : * acquisitions in typical cases. It should be safe because the
4056 : : * caller must have AccessExclusiveLock on the relation, or some other
4057 : : * reason to be certain that no one is loading new pages of the rel
4058 : : * into the buffer pool. (Otherwise we might well miss such pages
4059 : : * entirely.) Therefore, while the tag might be changing while we
4060 : : * look at it, it can't be changing *to* a value we care about, only
4061 : : * *away* from such a value. So false negatives are impossible, and
4062 : : * false positives are safe because we'll recheck after getting the
4063 : : * buffer lock.
4064 : : *
4065 : : * We could check forkNum and blockNum as well as the rlocator, but
4066 : : * the incremental win from doing so seems small.
4067 : : */
599 4068 [ + + ]: 2692480 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4329 tgl@sss.pgh.pa.us 4069 : 2683660 : continue;
4070 : :
2926 andres@anarazel.de 4071 : 8820 : buf_state = LockBufHdr(bufHdr);
4072 : :
1664 fujii@postgresql.org 4073 [ + + ]: 21892 : for (j = 0; j < nforks; j++)
4074 : : {
599 rhaas@postgresql.org 4075 [ + - ]: 15493 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4076 [ + + ]: 15493 : BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
1664 fujii@postgresql.org 4077 [ + + ]: 8709 : bufHdr->tag.blockNum >= firstDelBlock[j])
4078 : : {
1431 tgl@sss.pgh.pa.us 4079 : 2421 : InvalidateBuffer(bufHdr); /* releases spinlock */
1664 fujii@postgresql.org 4080 : 2421 : break;
4081 : : }
4082 : : }
4083 [ + + ]: 8820 : if (j >= nforks)
2926 andres@anarazel.de 4084 : 6399 : UnlockBufHdr(bufHdr, buf_state);
4085 : : }
4086 : : }
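
/*
 * In short, the function above uses two strategies: when the cached size
 * of every requested fork is known and the number of blocks at or past
 * firstDelBlock is below BUF_DROP_FULL_SCAN_THRESHOLD, the affected
 * buffers are dropped with per-block FindAndDropRelationBuffers()
 * lookups; otherwise it falls back to scanning all NBuffers descriptors.
 */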
4087 : :
4088 : : /* ---------------------------------------------------------------------
4089 : : * DropRelationsAllBuffers
4090 : : *
4091 : : * This function removes from the buffer pool all the pages of all
4092 : : * forks of the specified relations. It's equivalent to calling
4093 : : * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
4094 : : * --------------------------------------------------------------------
4095 : : */
4096 : : void
642 rhaas@postgresql.org 4097 : 12109 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
4098 : : {
4099 : : int i;
1187 akapila@postgresql.o 4100 : 12109 : int n = 0;
4101 : : SMgrRelation *rels;
4102 : : BlockNumber (*block)[MAX_FORKNUM + 1];
1068 tgl@sss.pgh.pa.us 4103 : 12109 : uint64 nBlocksToInvalidate = 0;
4104 : : RelFileLocator *locators;
1187 akapila@postgresql.o 4105 : 12109 : bool cached = true;
4106 : : bool use_bsearch;
4107 : :
648 rhaas@postgresql.org 4108 [ - + ]: 12109 : if (nlocators == 0)
4105 alvherre@alvh.no-ip. 4109 :UBC 0 : return;
4110 : :
648 rhaas@postgresql.org 4111 :CBC 12109 : rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4112 : :
4113 : : /* If it's a local relation, it's localbuf.c's problem. */
4114 [ + + ]: 54590 : for (i = 0; i < nlocators; i++)
4115 : : {
4116 [ + + ]: 42481 : if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4117 : : {
42 heikki.linnakangas@i 4118 [ + - ]:GNC 2903 : if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
642 rhaas@postgresql.org 4119 :CBC 2903 : DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4120 : : }
4121 : : else
1187 akapila@postgresql.o 4122 : 39578 : rels[n++] = smgr_reln[i];
4123 : : }
4124 : :
4125 : : /*
4126 : : * If there are no non-local relations, then we're done. Release the
4127 : : * memory and return.
4128 : : */
4105 alvherre@alvh.no-ip. 4129 [ + + ]: 12109 : if (n == 0)
4130 : : {
1187 akapila@postgresql.o 4131 : 733 : pfree(rels);
4329 tgl@sss.pgh.pa.us 4132 : 733 : return;
4133 : : }
4134 : :
4135 : : /*
4136 : : * This is used to remember the number of blocks for all the relation
4137 : : * forks.
4138 : : */
4139 : : block = (BlockNumber (*)[MAX_FORKNUM + 1])
1187 akapila@postgresql.o 4140 : 11376 : palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4141 : :
4142 : : /*
4143 : : * We can avoid scanning the entire buffer pool if we know the exact size
4144 : : * of each of the given relation forks. See DropRelationBuffers.
4145 : : */
4146 [ + + + + ]: 24113 : for (i = 0; i < n && cached; i++)
4147 : : {
599 drowley@postgresql.o 4148 [ + + ]: 21393 : for (int j = 0; j <= MAX_FORKNUM; j++)
4149 : : {
4150 : : /* Get the number of blocks for a relation's fork. */
1187 akapila@postgresql.o 4151 : 19240 : block[i][j] = smgrnblocks_cached(rels[i], j);
4152 : :
4153 : : /* We only need to consider the relation forks that exist. */
4154 [ + + ]: 19240 : if (block[i][j] == InvalidBlockNumber)
4155 : : {
4156 [ + + ]: 16907 : if (!smgrexists(rels[i], j))
4157 : 6323 : continue;
4158 : 10584 : cached = false;
4159 : 10584 : break;
4160 : : }
4161 : :
4162 : : /* calculate the total number of blocks to be invalidated */
4163 : 2333 : nBlocksToInvalidate += block[i][j];
4164 : : }
4165 : : }
4166 : :
4167 : : /*
4168 : : * We apply the optimization iff the total number of blocks to invalidate
4169 : : * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4170 : : */
4171 [ + + + + ]: 11376 : if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4172 : : {
4173 [ + + ]: 1268 : for (i = 0; i < n; i++)
4174 : : {
599 drowley@postgresql.o 4175 [ + + ]: 3510 : for (int j = 0; j <= MAX_FORKNUM; j++)
4176 : : {
4177 : : /* ignore relation forks that don't exist */
1187 akapila@postgresql.o 4178 [ + + ]: 2808 : if (!BlockNumberIsValid(block[i][j]))
4179 : 2095 : continue;
4180 : :
4181 : : /* drop all the buffers for a particular relation fork */
642 rhaas@postgresql.org 4182 : 713 : FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4183 : 713 : j, block[i][j], 0);
4184 : : }
4185 : : }
4186 : :
1187 akapila@postgresql.o 4187 : 566 : pfree(block);
4188 : 566 : pfree(rels);
4189 : 566 : return;
4190 : : }
4191 : :
4192 : 10810 : pfree(block);
648 rhaas@postgresql.org 4193 : 10810 : locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
1187 akapila@postgresql.o 4194 [ + + ]: 49686 : for (i = 0; i < n; i++)
648 rhaas@postgresql.org 4195 : 38876 : locators[i] = rels[i]->smgr_rlocator.locator;
4196 : :
4197 : : /*
4198 : : * For a small number of relations to drop, just use a simple walk-through
4199 : : * to save the bsearch overhead. The threshold is a guess rather than an
4200 : : * exactly determined value, as it depends on many factors (CPU and RAM
4201 : : * speed, amount of shared buffers, etc.).
4202 : : */
1471 noah@leadboat.com 4203 : 10810 : use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4204 : :
4205 : : /* sort the list of rlocators if necessary */
4105 alvherre@alvh.no-ip. 4206 [ + + ]: 10810 : if (use_bsearch)
58 nathan@postgresql.or 4207 :GNC 165 : qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4208 : :
4329 tgl@sss.pgh.pa.us 4209 [ + + ]:CBC 118047546 : for (i = 0; i < NBuffers; i++)
4210 : : {
648 rhaas@postgresql.org 4211 : 118036736 : RelFileLocator *rlocator = NULL;
3072 4212 : 118036736 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4213 : : uint32 buf_state;
4214 : :
4215 : : /*
4216 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
4217 : : * saves some cycles.
4218 : : */
4219 : :
4105 alvherre@alvh.no-ip. 4220 [ + + ]: 118036736 : if (!use_bsearch)
4221 : : {
4222 : : int j;
4223 : :
4224 [ + + ]: 483404685 : for (j = 0; j < n; j++)
4225 : : {
599 rhaas@postgresql.org 4226 [ + + ]: 367176016 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4227 : : {
648 4228 : 80067 : rlocator = &locators[j];
4105 alvherre@alvh.no-ip. 4229 : 80067 : break;
4230 : : }
4231 : : }
4232 : : }
4233 : : else
4234 : : {
4235 : : RelFileLocator locator;
4236 : :
599 rhaas@postgresql.org 4237 : 1728000 : locator = BufTagGetRelFileLocator(&bufHdr->tag);
4238 : 1728000 : rlocator = bsearch((const void *) &(locator),
4239 : : locators, n, sizeof(RelFileLocator),
4240 : : rlocator_comparator);
4241 : : }
4242 : :
4243 : : /* buffer doesn't belong to any of the given relfilelocators; skip it */
648 4244 [ + + ]: 118036736 : if (rlocator == NULL)
4329 tgl@sss.pgh.pa.us 4245 : 117955010 : continue;
4246 : :
2926 andres@anarazel.de 4247 : 81726 : buf_state = LockBufHdr(bufHdr);
599 rhaas@postgresql.org 4248 [ + - ]: 81726 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4329 tgl@sss.pgh.pa.us 4249 : 81726 : InvalidateBuffer(bufHdr); /* releases spinlock */
4250 : : else
2926 andres@anarazel.de 4251 :UBC 0 : UnlockBufHdr(bufHdr, buf_state);
4252 : : }
4253 : :
648 rhaas@postgresql.org 4254 :CBC 10810 : pfree(locators);
1187 akapila@postgresql.o 4255 : 10810 : pfree(rels);
4256 : : }
4257 : :
4258 : : /* ---------------------------------------------------------------------
4259 : : * FindAndDropRelationBuffers
4260 : : *
4261 : : * This function performs lookups in the BufMapping table and removes from
4262 : : * the buffer pool all the pages of the specified relation fork that have block
4263 : : * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all
4264 : : * pages are removed.)
4265 : : * --------------------------------------------------------------------
4266 : : */
4267 : : static void
642 rhaas@postgresql.org 4268 : 792 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
4269 : : BlockNumber nForkBlock,
4270 : : BlockNumber firstDelBlock)
4271 : : {
4272 : : BlockNumber curBlock;
4273 : :
1188 akapila@postgresql.o 4274 [ + + ]: 1922 : for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4275 : : {
4276 : : uint32 bufHash; /* hash value for tag */
4277 : : BufferTag bufTag; /* identity of requested block */
4278 : : LWLock *bufPartitionLock; /* buffer partition lock for it */
4279 : : int buf_id;
4280 : : BufferDesc *bufHdr;
4281 : : uint32 buf_state;
4282 : :
4283 : : /* create a tag so we can lookup the buffer */
627 rhaas@postgresql.org 4284 : 1130 : InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4285 : :
4286 : : /* determine its hash code and partition lock ID */
1188 akapila@postgresql.o 4287 : 1130 : bufHash = BufTableHashCode(&bufTag);
4288 : 1130 : bufPartitionLock = BufMappingPartitionLock(bufHash);
4289 : :
4290 : : /* Check that it is in the buffer pool. If not, do nothing. */
4291 : 1130 : LWLockAcquire(bufPartitionLock, LW_SHARED);
4292 : 1130 : buf_id = BufTableLookup(&bufTag, bufHash);
4293 : 1130 : LWLockRelease(bufPartitionLock);
4294 : :
4295 [ + + ]: 1130 : if (buf_id < 0)
4296 : 151 : continue;
4297 : :
4298 : 979 : bufHdr = GetBufferDescriptor(buf_id);
4299 : :
4300 : : /*
4301 : : * We need to lock the buffer header and recheck if the buffer is
4302 : : * still associated with the same block because the buffer could be
4303 : : * evicted by some other backend loading blocks for a different
4304 : : * relation after we release the lock on the BufMapping table.
4305 : : */
4306 : 979 : buf_state = LockBufHdr(bufHdr);
4307 : :
599 rhaas@postgresql.org 4308 [ + - + - ]: 1958 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4309 : 979 : BufTagGetForkNum(&bufHdr->tag) == forkNum &&
1188 akapila@postgresql.o 4310 [ + - ]: 979 : bufHdr->tag.blockNum >= firstDelBlock)
4311 : 979 : InvalidateBuffer(bufHdr); /* releases spinlock */
4312 : : else
1188 akapila@postgresql.o 4313 :UBC 0 : UnlockBufHdr(bufHdr, buf_state);
4314 : : }
1188 akapila@postgresql.o 4315 :CBC 792 : }
4316 : :
4317 : : /* ---------------------------------------------------------------------
4318 : : * DropDatabaseBuffers
4319 : : *
4320 : : * This function removes all the buffers in the buffer cache for a
4321 : : * particular database. Dirty pages are simply dropped, without
4322 : : * bothering to write them out first. This is used when we destroy a
4323 : : * database, to avoid trying to flush data to disk when the directory
4324 : : * tree no longer exists. Implementation is pretty similar to
4325 : : * DropRelationBuffers() which is for destroying just one relation.
4326 : : * --------------------------------------------------------------------
4327 : : */
4328 : : void
6591 tgl@sss.pgh.pa.us 4329 : 64 : DropDatabaseBuffers(Oid dbid)
4330 : : {
4331 : : int i;
4332 : :
4333 : : /*
4334 : : * We needn't consider local buffers, since by assumption the target
4335 : : * database isn't our own.
4336 : : */
4337 : :
6981 4338 [ + + ]: 203328 : for (i = 0; i < NBuffers; i++)
4339 : : {
3072 rhaas@postgresql.org 4340 : 203264 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4341 : : uint32 buf_state;
4342 : :
4343 : : /*
4344 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
4345 : : * saves some cycles.
4346 : : */
599 4347 [ + + ]: 203264 : if (bufHdr->tag.dbOid != dbid)
4329 tgl@sss.pgh.pa.us 4348 : 194782 : continue;
4349 : :
2926 andres@anarazel.de 4350 : 8482 : buf_state = LockBufHdr(bufHdr);
599 rhaas@postgresql.org 4351 [ + - ]: 8482 : if (bufHdr->tag.dbOid == dbid)
6756 bruce@momjian.us 4352 : 8482 : InvalidateBuffer(bufHdr); /* releases spinlock */
4353 : : else
2926 andres@anarazel.de 4354 :UBC 0 : UnlockBufHdr(bufHdr, buf_state);
4355 : : }
10141 scrappy@hub.org 4356 :CBC 64 : }
4357 : :
4358 : : /* -----------------------------------------------------------------
4359 : : * PrintBufferDescs
4360 : : *
4361 : : * this function prints all the buffer descriptors, for debugging
4362 : : * use only.
4363 : : * -----------------------------------------------------------------
4364 : : */
4365 : : #ifdef NOT_USED
4366 : : void
4367 : : PrintBufferDescs(void)
4368 : : {
4369 : : int i;
4370 : :
4371 : : for (i = 0; i < NBuffers; ++i)
4372 : : {
4373 : : BufferDesc *buf = GetBufferDescriptor(i);
4374 : : Buffer b = BufferDescriptorGetBuffer(buf);
4375 : :
4376 : : /* theoretically we should lock the bufhdr here */
4377 : : elog(LOG,
4378 : : "[%02d] (freeNext=%d, rel=%s, "
4379 : : "blockNum=%u, flags=0x%x, refcount=%u %d)",
4380 : : i, buf->freeNext,
4381 : : relpathbackend(BufTagGetRelFileLocator(&buf->tag),
4382 : : INVALID_PROC_NUMBER, BufTagGetForkNum(&buf->tag)),
4383 : : buf->tag.blockNum, buf->flags,
4384 : : buf->refcount, GetPrivateRefCount(b));
4385 : : }
4386 : : }
4387 : : #endif
4388 : :
4389 : : #ifdef NOT_USED
4390 : : void
4391 : : PrintPinnedBufs(void)
4392 : : {
4393 : : int i;
4394 : :
4395 : : for (i = 0; i < NBuffers; ++i)
4396 : : {
4397 : : BufferDesc *buf = GetBufferDescriptor(i);
4398 : : Buffer b = BufferDescriptorGetBuffer(buf);
4399 : :
4400 : : if (GetPrivateRefCount(b) > 0)
4401 : : {
4402 : : /* theoretically we should lock the bufhdr here */
4403 : : elog(LOG,
4404 : : "[%02d] (freeNext=%d, rel=%s, "
4405 : : "blockNum=%u, flags=0x%x, refcount=%u %d)",
4406 : : i, buf->freeNext,
4407 : : relpathperm(BufTagGetRelFileLocator(&buf->tag),
4408 : : BufTagGetForkNum(&buf->tag)),
4409 : : buf->tag.blockNum, buf->flags,
4410 : : buf->refcount, GetPrivateRefCount(b));
4411 : : }
4412 : : }
4413 : : }
4414 : : #endif
4415 : :
4416 : : /* ---------------------------------------------------------------------
4417 : : * FlushRelationBuffers
4418 : : *
4419 : : * This function writes all dirty pages of a relation out to disk
4420 : : * (or more accurately, out to kernel disk buffers), ensuring that the
4421 : : * kernel has an up-to-date view of the relation.
4422 : : *
4423 : : * Generally, the caller should be holding AccessExclusiveLock on the
4424 : : * target relation to ensure that no other backend is busy dirtying
4425 : : * more blocks of the relation; the effects can't be expected to last
4426 : : * after the lock is released.
4427 : : *
4428 : : * XXX currently it sequentially searches the buffer pool, should be
4429 : : * changed to more clever ways of searching. This routine is not
4430 : : * used in any performance-critical code paths, so it's not worth
4431 : : * adding additional overhead to normal paths to make it go faster.
4432 : : * --------------------------------------------------------------------
4433 : : */
4434 : : void
6965 tgl@sss.pgh.pa.us 4435 : 130 : FlushRelationBuffers(Relation rel)
4436 : : {
4437 : : int i;
4438 : : BufferDesc *bufHdr;
74 heikki.linnakangas@i 4439 :GNC 130 : SMgrRelation srel = RelationGetSmgr(rel);
4440 : :
4871 rhaas@postgresql.org 4441 [ + + ]:CBC 130 : if (RelationUsesLocalBuffers(rel))
4442 : : {
9701 vadim4o@yahoo.com 4443 [ + + ]: 909 : for (i = 0; i < NLocBuffer; i++)
4444 : : {
4445 : : uint32 buf_state;
4446 : : instr_time io_start;
4447 : :
3363 andres@anarazel.de 4448 : 900 : bufHdr = GetLocalBufferDescriptor(i);
599 rhaas@postgresql.org 4449 [ + + ]: 900 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
2926 andres@anarazel.de 4450 [ + + ]: 300 : ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4451 : : (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4452 : : {
4453 : : ErrorContextCallback errcallback;
4454 : : Page localpage;
4455 : :
4041 simon@2ndQuadrant.co 4456 : 297 : localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4457 : :
4458 : : /* Setup error traceback support for ereport() */
4171 heikki.linnakangas@i 4459 : 297 : errcallback.callback = local_buffer_write_error_callback;
4460 : 297 : errcallback.arg = (void *) bufHdr;
4461 : 297 : errcallback.previous = error_context_stack;
4462 : 297 : error_context_stack = &errcallback;
4463 : :
4041 simon@2ndQuadrant.co 4464 : 297 : PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4465 : :
120 michael@paquier.xyz 4466 :GNC 297 : io_start = pgstat_prepare_io_time(track_io_timing);
4467 : :
74 heikki.linnakangas@i 4468 : 297 : smgrwrite(srel,
599 rhaas@postgresql.org 4469 :CBC 297 : BufTagGetForkNum(&bufHdr->tag),
4470 : : bufHdr->tag.blockNum,
4471 : : localpage,
4472 : : false);
4473 : :
373 andres@anarazel.de 4474 : 297 : pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
4475 : : IOCONTEXT_NORMAL, IOOP_WRITE,
4476 : : io_start, 1);
4477 : :
2926 4478 : 297 : buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
2746 4479 : 297 : pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4480 : :
373 4481 : 297 : pgBufferUsage.local_blks_written++;
4482 : :
4483 : : /* Pop the error context stack */
4171 heikki.linnakangas@i 4484 : 297 : error_context_stack = errcallback.previous;
4485 : : }
4486 : : }
4487 : :
7258 tgl@sss.pgh.pa.us 4488 : 9 : return;
4489 : : }
4490 : :
9701 vadim4o@yahoo.com 4491 [ + + ]: 1429881 : for (i = 0; i < NBuffers; i++)
4492 : : {
4493 : : uint32 buf_state;
4494 : :
3363 andres@anarazel.de 4495 : 1429760 : bufHdr = GetBufferDescriptor(i);
4496 : :
4497 : : /*
4498 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
4499 : : * saves some cycles.
4500 : : */
599 rhaas@postgresql.org 4501 [ + + ]: 1429760 : if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4329 tgl@sss.pgh.pa.us 4502 : 1429555 : continue;
4503 : :
4504 : : /* Make sure we can handle the pin */
3373 andres@anarazel.de 4505 : 205 : ReservePrivateRefCountEntry();
158 heikki.linnakangas@i 4506 :GNC 205 : ResourceOwnerEnlarge(CurrentResourceOwner);
4507 : :
2926 andres@anarazel.de 4508 :CBC 205 : buf_state = LockBufHdr(bufHdr);
599 rhaas@postgresql.org 4509 [ + - ]: 205 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
2926 andres@anarazel.de 4510 [ + + ]: 205 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4511 : : {
6965 tgl@sss.pgh.pa.us 4512 : 163 : PinBuffer_Locked(bufHdr);
3043 rhaas@postgresql.org 4513 : 163 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
74 heikki.linnakangas@i 4514 :GNC 163 : FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3043 rhaas@postgresql.org 4515 :CBC 163 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
562 michael@paquier.xyz 4516 : 163 : UnpinBuffer(bufHdr);
4517 : : }
4518 : : else
2926 andres@anarazel.de 4519 : 42 : UnlockBufHdr(bufHdr, buf_state);
4520 : : }
4521 : : }
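
/*
 * Note on the contract above: "flushed" here means written to the kernel
 * (smgrwrite()/FlushBuffer()); it does not by itself guarantee that the
 * data has reached stable storage. Callers needing durability must also
 * sync the underlying files, e.g. through the smgr layer.
 */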
4522 : :
4523 : : /* ---------------------------------------------------------------------
4524 : : * FlushRelationsAllBuffers
4525 : : *
4526 : : * This function flushes out of the buffer pool all the pages of all
4527 : : * forks of the specified smgr relations. It's equivalent to calling
4528 : : * FlushRelationBuffers once per relation. The relations are assumed not
4529 : : * to use local buffers.
4530 : : * --------------------------------------------------------------------
4531 : : */
4532 : : void
1471 noah@leadboat.com 4533 : 9 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
4534 : : {
4535 : : int i;
4536 : : SMgrSortArray *srels;
4537 : : bool use_bsearch;
4538 : :
4539 [ - + ]: 9 : if (nrels == 0)
1471 noah@leadboat.com 4540 :UBC 0 : return;
4541 : :
4542 : : /* fill-in array for qsort */
1471 noah@leadboat.com 4543 :CBC 9 : srels = palloc(sizeof(SMgrSortArray) * nrels);
4544 : :
4545 [ + + ]: 18 : for (i = 0; i < nrels; i++)
4546 : : {
648 rhaas@postgresql.org 4547 [ - + ]: 9 : Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4548 : :
4549 : 9 : srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
1471 noah@leadboat.com 4550 : 9 : srels[i].srel = smgrs[i];
4551 : : }
4552 : :
4553 : : /*
4554 : : * Save the bsearch overhead for a low number of relations to sync. See
4555 : : * DropRelationsAllBuffers for details.
4556 : : */
4557 : 9 : use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4558 : :
4559 : : /* sort the list of SMgrRelations if necessary */
4560 [ - + ]: 9 : if (use_bsearch)
58 nathan@postgresql.or 4561 :UNC 0 : qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4562 : :
1471 noah@leadboat.com 4563 [ + + ]:CBC 147465 : for (i = 0; i < NBuffers; i++)
4564 : : {
4565 : 147456 : SMgrSortArray *srelent = NULL;
4566 : 147456 : BufferDesc *bufHdr = GetBufferDescriptor(i);
4567 : : uint32 buf_state;
4568 : :
4569 : : /*
4570 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
4571 : : * saves some cycles.
4572 : : */
4573 : :
4574 [ + - ]: 147456 : if (!use_bsearch)
4575 : : {
4576 : : int j;
4577 : :
4578 [ + + ]: 291094 : for (j = 0; j < nrels; j++)
4579 : : {
599 rhaas@postgresql.org 4580 [ + + ]: 147456 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4581 : : {
1471 noah@leadboat.com 4582 : 3818 : srelent = &srels[j];
4583 : 3818 : break;
4584 : : }
4585 : : }
4586 : : }
4587 : : else
4588 : : {
4589 : : RelFileLocator rlocator;
4590 : :
599 rhaas@postgresql.org 4591 :UBC 0 : rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4592 : 0 : srelent = bsearch((const void *) &(rlocator),
4593 : : srels, nrels, sizeof(SMgrSortArray),
4594 : : rlocator_comparator);
4595 : : }
4596 : :
4597 : : /* buffer doesn't belong to any of the given relfilelocators; skip it */
1471 noah@leadboat.com 4598 [ + + ]:CBC 147456 : if (srelent == NULL)
4599 : 143638 : continue;
4600 : :
4601 : : /* Make sure we can handle the pin */
4602 : 3818 : ReservePrivateRefCountEntry();
158 heikki.linnakangas@i 4603 :GNC 3818 : ResourceOwnerEnlarge(CurrentResourceOwner);
4604 : :
1471 noah@leadboat.com 4605 :CBC 3818 : buf_state = LockBufHdr(bufHdr);
599 rhaas@postgresql.org 4606 [ + - ]: 3818 : if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
1471 noah@leadboat.com 4607 [ + + ]: 3818 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4608 : : {
4609 : 3367 : PinBuffer_Locked(bufHdr);
4610 : 3367 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
430 andres@anarazel.de 4611 : 3367 : FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
1471 noah@leadboat.com 4612 : 3367 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
562 michael@paquier.xyz 4613 : 3367 : UnpinBuffer(bufHdr);
4614 : : }
4615 : : else
1471 noah@leadboat.com 4616 : 451 : UnlockBufHdr(bufHdr, buf_state);
4617 : : }
4618 : :
4619 : 9 : pfree(srels);
4620 : : }
4621 : :
4622 : : /* ---------------------------------------------------------------------
4623 : : * RelationCopyStorageUsingBuffer
4624 : : *
4625 : : * Copy a fork's data using bufmgr. Same as RelationCopyStorage, but instead
4626 : : * of using smgrread and smgrextend this will copy using bufmgr APIs.
4627 : : *
4628 : : * Refer to the comments atop CreateAndCopyRelationData() for details about
4629 : : * the 'permanent' parameter.
4630 : : * --------------------------------------------------------------------
4631 : : */
4632 : : static void
611 rhaas@postgresql.org 4633 : 64249 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
4634 : : RelFileLocator dstlocator,
4635 : : ForkNumber forkNum, bool permanent)
4636 : : {
4637 : : Buffer srcBuf;
4638 : : Buffer dstBuf;
4639 : : Page srcPage;
4640 : : Page dstPage;
4641 : : bool use_wal;
4642 : : BlockNumber nblocks;
4643 : : BlockNumber blkno;
4644 : : PGIOAlignedBlock buf;
4645 : : BufferAccessStrategy bstrategy_src;
4646 : : BufferAccessStrategy bstrategy_dst;
4647 : :
4648 : : /*
4649 : : * In general, we want to write WAL whenever wal_level > 'minimal', but we
4650 : : * can skip it when copying any fork of an unlogged relation other than
4651 : : * the init fork.
4652 : : */
747 4653 [ + + - + : 64249 : use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
- - ]
4654 : :
4655 : : /* Get number of blocks in the source relation. */
42 heikki.linnakangas@i 4656 :GNC 64249 : nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
4657 : : forkNum);
4658 : :
4659 : : /* Nothing to copy; just return. */
747 rhaas@postgresql.org 4660 [ + + ]:CBC 64249 : if (nblocks == 0)
4661 : 11066 : return;
4662 : :
4663 : : /*
4664 : : * Bulk extend the destination relation to the same size as the source
4665 : : * relation before starting to copy block by block.
4666 : : */
605 4667 : 53183 : memset(buf.data, 0, BLCKSZ);
42 heikki.linnakangas@i 4668 :GNC 53183 : smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
4669 : : buf.data, true);
4670 : :
4671 : : /* This is a bulk operation, so use buffer access strategies. */
747 rhaas@postgresql.org 4672 :CBC 53183 : bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
4673 : 53183 : bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
4674 : :
4675 : : /* Iterate over each block of the source relation file. */
4676 [ + + ]: 256198 : for (blkno = 0; blkno < nblocks; blkno++)
4677 : : {
4678 [ - + ]: 203015 : CHECK_FOR_INTERRUPTS();
4679 : :
4680 : : /* Read block from source relation. */
611 4681 : 203015 : srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
4682 : : RBM_NORMAL, bstrategy_src,
4683 : : permanent);
617 tgl@sss.pgh.pa.us 4684 : 203015 : LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
747 rhaas@postgresql.org 4685 : 203015 : srcPage = BufferGetPage(srcBuf);
4686 : :
605 4687 : 203015 : dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
4688 : : RBM_ZERO_AND_LOCK, bstrategy_dst,
4689 : : permanent);
617 tgl@sss.pgh.pa.us 4690 : 203015 : dstPage = BufferGetPage(dstBuf);
4691 : :
747 rhaas@postgresql.org 4692 : 203015 : START_CRIT_SECTION();
4693 : :
4694 : : /* Copy page data from the source to the destination. */
4695 : 203015 : memcpy(dstPage, srcPage, BLCKSZ);
4696 : 203015 : MarkBufferDirty(dstBuf);
4697 : :
4698 : : /* WAL-log the copied page. */
4699 [ + + ]: 203015 : if (use_wal)
4700 : 114168 : log_newpage_buffer(dstBuf, true);
4701 : :
4702 [ - + ]: 203015 : END_CRIT_SECTION();
4703 : :
4704 : 203015 : UnlockReleaseBuffer(dstBuf);
617 tgl@sss.pgh.pa.us 4705 : 203015 : UnlockReleaseBuffer(srcBuf);
4706 : : }
4707 : :
391 andres@anarazel.de 4708 : 53183 : FreeAccessStrategy(bstrategy_src);
4709 : 53183 : FreeAccessStrategy(bstrategy_dst);
4710 : : }
4711 : :
4712 : : /* ---------------------------------------------------------------------
4713 : : * CreateAndCopyRelationData
4714 : : *
4715 : : * Create destination relation storage and copy all forks from the
4716 : : * source relation to the destination.
4717 : : *
4718 : : * Pass permanent as true for permanent relations and false for
4719 : : * unlogged relations. Currently this API is not supported for
4720 : : * temporary relations.
4721 : : * --------------------------------------------------------------------
4722 : : */
4723 : : void
648 rhaas@postgresql.org 4724 : 48189 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
4725 : : RelFileLocator dst_rlocator, bool permanent)
4726 : : {
4727 : : char relpersistence;
4728 : : SMgrRelation src_rel;
4729 : : SMgrRelation dst_rel;
4730 : :
4731 : : /* Set the relpersistence. */
747 4732 [ + - ]: 48189 : relpersistence = permanent ?
4733 : : RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4734 : :
42 heikki.linnakangas@i 4735 :GNC 48189 : src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
4736 : 48189 : dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
4737 : :
4738 : : /*
4739 : : * Create and copy all forks of the relation. During create database we
4740 : : * have a separate cleanup mechanism which deletes the complete database
4741 : : * directory. Therefore, each individual relation doesn't need to be
4742 : : * registered for cleanup.
4743 : : */
648 rhaas@postgresql.org 4744 :CBC 48189 : RelationCreateStorage(dst_rlocator, relpersistence, false);
4745 : :
4746 : : /* copy main fork. */
611 4747 : 48189 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4748 : : permanent);
4749 : :
4750 : : /* copy those extra forks that exist */
747 4751 : 48189 : for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4752 [ + + ]: 192756 : forkNum <= MAX_FORKNUM; forkNum++)
4753 : : {
74 heikki.linnakangas@i 4754 [ + + ]:GNC 144567 : if (smgrexists(src_rel, forkNum))
4755 : : {
4756 : 16060 : smgrcreate(dst_rel, forkNum, false);
4757 : :
4758 : : /*
4759 : : * WAL log creation if the relation is persistent, or this is the
4760 : : * init fork of an unlogged relation.
4761 : : */
747 rhaas@postgresql.org 4762 [ - + - - ]:CBC 16060 : if (permanent || forkNum == INIT_FORKNUM)
648 4763 : 16060 : log_smgrcreate(&dst_rlocator, forkNum);
4764 : :
4765 : : /* Copy a fork's data, block by block. */
611 4766 : 16060 : RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4767 : : permanent);
4768 : : }
4769 : : }
747 4770 : 48189 : }
4771 : :
4772 : : /* ---------------------------------------------------------------------
4773 : : * FlushDatabaseBuffers
4774 : : *
4775 : : * This function writes all dirty pages of a database out to disk
4776 : : * (or more accurately, out to kernel disk buffers), ensuring that the
4777 : : * kernel has an up-to-date view of the database.
4778 : : *
4779 : : * Generally, the caller should be holding an appropriate lock to ensure
4780 : : * no other backend is active in the target database; otherwise more
4781 : : * pages could get dirtied.
4782 : : *
4783 : : * Note we don't worry about flushing any pages of temporary relations.
4784 : : * It's assumed these wouldn't be interesting.
4785 : : * --------------------------------------------------------------------
4786 : : */
4787 : : void
6135 tgl@sss.pgh.pa.us 4788 : 16 : FlushDatabaseBuffers(Oid dbid)
4789 : : {
4790 : : int i;
4791 : : BufferDesc *bufHdr;
4792 : :
4793 [ + + ]: 2064 : for (i = 0; i < NBuffers; i++)
4794 : : {
4795 : : uint32 buf_state;
4796 : :
3363 andres@anarazel.de 4797 : 2048 : bufHdr = GetBufferDescriptor(i);
4798 : :
4799 : : /*
4800 : : * As in DropRelationBuffers, an unlocked precheck should be safe and
4801 : : * saves some cycles.
4802 : : */
599 rhaas@postgresql.org 4803 [ + + ]: 2048 : if (bufHdr->tag.dbOid != dbid)
4329 tgl@sss.pgh.pa.us 4804 : 1668 : continue;
4805 : :
4806 : : /* Make sure we can handle the pin */
3373 andres@anarazel.de 4807 : 380 : ReservePrivateRefCountEntry();
158 heikki.linnakangas@i 4808 :GNC 380 : ResourceOwnerEnlarge(CurrentResourceOwner);
4809 : :
2926 andres@anarazel.de 4810 :CBC 380 : buf_state = LockBufHdr(bufHdr);
599 rhaas@postgresql.org 4811 [ + - ]: 380 : if (bufHdr->tag.dbOid == dbid &&
2926 andres@anarazel.de 4812 [ + + ]: 380 : (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4813 : : {
6135 tgl@sss.pgh.pa.us 4814 : 288 : PinBuffer_Locked(bufHdr);
3043 rhaas@postgresql.org 4815 : 288 : LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
430 andres@anarazel.de 4816 : 288 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3043 rhaas@postgresql.org 4817 : 288 : LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
562 michael@paquier.xyz 4818 : 288 : UnpinBuffer(bufHdr);
4819 : : }
4820 : : else
2926 andres@anarazel.de 4821 : 92 : UnlockBufHdr(bufHdr, buf_state);
4822 : : }
6135 tgl@sss.pgh.pa.us 4823 : 16 : }
4824 : :
4825 : : /*
4826 : : * Flush a previously locked (shared or exclusive) and pinned buffer to the
4827 : : * OS.
4828 : : */
4829 : : void
3048 andres@anarazel.de 4830 : 29 : FlushOneBuffer(Buffer buffer)
4831 : : {
4832 : : BufferDesc *bufHdr;
4833 : :
4834 : : /* currently not needed, but no fundamental reason not to support */
4835 [ - + ]: 29 : Assert(!BufferIsLocal(buffer));
4836 : :
4837 [ - + - + : 29 : Assert(BufferIsPinned(buffer));
- + ]
4838 : :
4839 : 29 : bufHdr = GetBufferDescriptor(buffer - 1);
4840 : :
3043 rhaas@postgresql.org 4841 [ - + ]: 29 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4842 : :
430 andres@anarazel.de 4843 : 29 : FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3048 4844 : 29 : }
4845 : :
4846 : : /*
4847 : : * ReleaseBuffer -- release the pin on a buffer
4848 : : */
4849 : : void
7922 tgl@sss.pgh.pa.us 4850 : 52419906 : ReleaseBuffer(Buffer buffer)
4851 : : {
7121 4852 [ - + ]: 52419906 : if (!BufferIsValid(buffer))
4683 peter_e@gmx.net 4853 [ # # ]:UBC 0 : elog(ERROR, "bad buffer ID: %d", buffer);
4854 : :
9716 bruce@momjian.us 4855 [ + + ]:CBC 52419906 : if (BufferIsLocal(buffer))
375 andres@anarazel.de 4856 : 1510442 : UnpinLocalBuffer(buffer);
4857 : : else
4858 : 50909464 : UnpinBuffer(GetBufferDescriptor(buffer - 1));
10141 scrappy@hub.org 4859 : 52419906 : }
4860 : :
4861 : : /*
4862 : : * UnlockReleaseBuffer -- release the content lock and pin on a buffer
4863 : : *
4864 : : * This is just a shorthand for a common combination.
4865 : : */
4866 : : void
6589 tgl@sss.pgh.pa.us 4867 : 15729463 : UnlockReleaseBuffer(Buffer buffer)
4868 : : {
4869 : 15729463 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4870 : 15729463 : ReleaseBuffer(buffer);
4871 : 15729463 : }
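
/*
 * Illustrative usage (sketch of a hypothetical caller, not taken from real
 * code): a backend that holds both a pin and the content lock can drop
 * them together with the shorthand above, e.g.
 *
 *     LockBuffer(buf, BUFFER_LOCK_SHARE);
 *     ... examine the page ...
 *     UnlockReleaseBuffer(buf);    equivalent to LockBuffer(buf,
 *                                  BUFFER_LOCK_UNLOCK) plus ReleaseBuffer(buf)
 */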
4872 : :
4873 : : /*
4874 : : * IncrBufferRefCount
4875 : : * Increment the pin count on a buffer that we have *already* pinned
4876 : : * at least once.
4877 : : *
4878 : : * This function cannot be used on a buffer we do not have pinned,
4879 : : * because it doesn't change the shared buffer state.
4880 : : */
4881 : : void
7211 4882 : 9558731 : IncrBufferRefCount(Buffer buffer)
4883 : : {
7081 neilc@samurai.com 4884 [ - + + + : 9558731 : Assert(BufferIsPinned(buffer));
- + ]
158 heikki.linnakangas@i 4885 :GNC 9558731 : ResourceOwnerEnlarge(CurrentResourceOwner);
7211 tgl@sss.pgh.pa.us 4886 [ + + ]:CBC 9558731 : if (BufferIsLocal(buffer))
4887 : 347096 : LocalRefCount[-buffer - 1]++;
4888 : : else
4889 : : {
4890 : : PrivateRefCountEntry *ref;
4891 : :
3373 andres@anarazel.de 4892 : 9211635 : ref = GetPrivateRefCountEntry(buffer, true);
3515 4893 [ - + ]: 9211635 : Assert(ref != NULL);
4894 : 9211635 : ref->refcount++;
4895 : : }
2349 tgl@sss.pgh.pa.us 4896 : 9558731 : ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
7211 4897 : 9558731 : }
4898 : :
4899 : : /*
4900 : : * MarkBufferDirtyHint
4901 : : *
4902 : : * Mark a buffer dirty for non-critical changes.
4903 : : *
4904 : : * This is essentially the same as MarkBufferDirty, except:
4905 : : *
4906 : : * 1. The caller does not write WAL; so if checksums are enabled, we may need
4907 : : * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
4908 : : * 2. The caller might have only share-lock instead of exclusive-lock on the
4909 : : * buffer's content lock.
4910 : : * 3. This function does not guarantee that the buffer is always marked dirty
4911 : : * (due to a race condition), so it cannot be used for important changes.
4912 : : */
4913 : : void
3954 jdavis@postgresql.or 4914 : 9470838 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
4915 : : {
4916 : : BufferDesc *bufHdr;
2916 kgrittn@postgresql.o 4917 : 9470838 : Page page = BufferGetPage(buffer);
4918 : :
7121 tgl@sss.pgh.pa.us 4919 [ - + ]: 9470838 : if (!BufferIsValid(buffer))
4683 peter_e@gmx.net 4920 [ # # ]:UBC 0 : elog(ERROR, "bad buffer ID: %d", buffer);
4921 : :
8771 tgl@sss.pgh.pa.us 4922 [ + + ]:CBC 9470838 : if (BufferIsLocal(buffer))
4923 : : {
6589 4924 : 679437 : MarkLocalBufferDirty(buffer);
8771 4925 : 679437 : return;
4926 : : }
4927 : :
3363 andres@anarazel.de 4928 : 8791401 : bufHdr = GetBufferDescriptor(buffer - 1);
4929 : :
3515 4930 [ - + ]: 8791401 : Assert(GetPrivateRefCount(buffer) > 0);
4931 : : /* here, either share or exclusive lock is OK */
3043 rhaas@postgresql.org 4932 [ - + ]: 8791401 : Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4933 : :
4934 : : /*
4935 : : * This routine might get called many times on the same page, if we are
4936 : : * making the first scan after commit of an xact that added/deleted many
4937 : : * tuples. So, be as quick as we can if the buffer is already dirty. We
4938 : : * do this by not acquiring the spinlock if it looks like the status bits are
4939 : : * already set. Since we make this test unlocked, there's a chance we
4940 : : * might fail to notice that the flags have just been cleared, and failed
4941 : : * might fail to notice that the flags have just been cleared, and fail
4942 : : * is only intended to be used in cases where failing to write out the
4943 : : * data would be harmless anyway, it doesn't really matter.
4944 : : */
2926 andres@anarazel.de 4945 [ + + ]: 8791401 : if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4946 : : (BM_DIRTY | BM_JUST_DIRTIED))
4947 : : {
4041 simon@2ndQuadrant.co 4948 : 927733 : XLogRecPtr lsn = InvalidXLogRecPtr;
4949 : 927733 : bool dirtied = false;
737 rhaas@postgresql.org 4950 : 927733 : bool delayChkptFlags = false;
4951 : : uint32 buf_state;
4952 : :
4953 : : /*
4954 : : * If we need to protect hint bit updates from torn writes, WAL-log a
4955 : : * full page image of the page. This full page image is only necessary
4956 : : * if the hint bit update is the first change to the page since the
4957 : : * last checkpoint.
4958 : : *
4959 : : * We don't check full_page_writes here because that logic is included
4960 : : * when we call XLogInsert() since the value changes dynamically.
4961 : : */
2926 andres@anarazel.de 4962 [ + + + + : 1841274 : if (XLogHintBitIsNeeded() &&
+ + ]
4963 : 913541 : (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
4964 : : {
4965 : : /*
4966 : : * If we must not write WAL, due to a relfilelocator-specific
4967 : : * condition or being in recovery, don't dirty the page. We can
4968 : : * set the hint, just not dirty the page as a result so the hint
4969 : : * is lost when we evict the page or shutdown.
4970 : : *
4971 : : * See src/backend/storage/page/README for longer discussion.
4972 : : */
1471 noah@leadboat.com 4973 [ + + - + ]: 953519 : if (RecoveryInProgress() ||
599 rhaas@postgresql.org 4974 : 39981 : RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
4041 simon@2ndQuadrant.co 4975 : 873557 : return;
4976 : :
4977 : : /*
4978 : : * If the block is already dirty because we either made a change
4979 : : * or set a hint already, then we don't need to write a full page
4980 : : * image. Note that aggressive cleaning of blocks dirtied by hint
4981 : : * bit setting would increase the call rate. Bulk setting of hint
4982 : : * bits would reduce the call rate...
4983 : : *
4984 : : * We must issue the WAL record before we mark the buffer dirty.
4985 : : * Otherwise we might write the page before we write the WAL. That
4986 : : * causes a race condition, since a checkpoint might occur between
4987 : : * writing the WAL record and marking the buffer dirty. We solve
4988 : : * that with a kluge, but one that is already in use during
4989 : : * transaction commit to prevent race conditions. Basically, we
4990 : : * simply prevent the checkpoint WAL record from being written
4991 : : * until we have marked the buffer dirty. We don't start the
4992 : : * checkpoint flush until we have marked dirty, so our checkpoint
4993 : : * must flush the change to disk successfully or the checkpoint
4994 : : * never gets written, so crash recovery will fix it.
4995 : : *
4996 : : * It's possible we may enter here without an xid, so it is
4997 : : * essential that CreateCheckPoint waits for virtual transactions
4998 : : * rather than full transactionids.
4999 : : */
737 rhaas@postgresql.org 5000 [ - + ]: 39981 : Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
5001 : 39981 : MyProc->delayChkptFlags |= DELAY_CHKPT_START;
5002 : 39981 : delayChkptFlags = true;
3954 jdavis@postgresql.or 5003 : 39981 : lsn = XLogSaveBufferForHint(buffer, buffer_std);
5004 : : }
5005 : :
2926 andres@anarazel.de 5006 : 54176 : buf_state = LockBufHdr(bufHdr);
5007 : :
5008 [ - + ]: 54176 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5009 : :
5010 [ + + ]: 54176 : if (!(buf_state & BM_DIRTY))
5011 : : {
4041 simon@2ndQuadrant.co 5012 : 54157 : dirtied = true; /* Means "will be dirtied by this action" */
5013 : :
5014 : : /*
5015 : : * Set the page LSN if we wrote a backup block. We aren't supposed
5016 : : * to set this when only holding a share lock but as long as we
5017 : : * serialise it somehow we're OK. We choose to set LSN while
5018 : : * holding the buffer header lock, which causes any reader of an
5019 : : * LSN who holds only a share lock to also obtain a buffer header
5020 : : * lock before using PageGetLSN(), which is enforced in
5021 : : * BufferGetLSNAtomic().
5022 : : *
5023 : : * If checksums are enabled, you might think we should reset the
5024 : : * checksum here. That will happen when the page is written
5025 : : * sometime later in this checkpoint cycle.
5026 : : */
5027 [ + + ]: 54157 : if (!XLogRecPtrIsInvalid(lsn))
5028 : 6970 : PageSetLSN(page, lsn);
5029 : : }
5030 : :
2926 andres@anarazel.de 5031 : 54176 : buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5032 : 54176 : UnlockBufHdr(bufHdr, buf_state);
5033 : :
737 rhaas@postgresql.org 5034 [ + + ]: 54176 : if (delayChkptFlags)
5035 : 39981 : MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5036 : :
4041 simon@2ndQuadrant.co 5037 [ + + ]: 54176 : if (dirtied)
5038 : : {
4524 alvherre@alvh.no-ip. 5039 : 54157 : VacuumPageDirty++;
3667 rhaas@postgresql.org 5040 : 54157 : pgBufferUsage.shared_blks_dirtied++;
4524 alvherre@alvh.no-ip. 5041 [ + + ]: 54157 : if (VacuumCostActive)
5042 : 1860 : VacuumCostBalance += VacuumCostPageDirty;
5043 : : }
5044 : : }
5045 : : }
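
/*
 * Illustrative caller-side sketch (hypothetical, not from an actual
 * caller): a hint-bit setter holds a pin and at least a share content
 * lock, updates the page in place, and then uses MarkBufferDirtyHint()
 * rather than MarkBufferDirty(), accepting that the hint may be lost:
 *
 *     Page page = BufferGetPage(buffer);
 *     ... set a hint bit on a tuple on 'page' ...
 *     MarkBufferDirtyHint(buffer, true);    'true' = standard page layout
 */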
5046 : :
5047 : : /*
5048 : : * Release buffer content locks for shared buffers.
5049 : : *
5050 : : * Used to clean up after errors.
5051 : : *
5052 : : * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
5053 : : * of releasing buffer content locks per se; the only thing we need to deal
5054 : : * with here is clearing any PIN_COUNT request that was in progress.
5055 : : */
5056 : : void
8493 tgl@sss.pgh.pa.us 5057 : 45427 : UnlockBuffers(void)
5058 : : {
3072 rhaas@postgresql.org 5059 : 45427 : BufferDesc *buf = PinCountWaitBuf;
5060 : :
7120 tgl@sss.pgh.pa.us 5061 [ - + ]: 45427 : if (buf)
5062 : : {
5063 : : uint32 buf_state;
5064 : :
2926 andres@anarazel.de 5065 :UBC 0 : buf_state = LockBufHdr(buf);
5066 : :
5067 : : /*
5068 : : * Don't complain if flag bit not set; it could have been reset but we
5069 : : * got a cancel/die interrupt before getting the signal.
5070 : : */
5071 [ # # ]: 0 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
52 heikki.linnakangas@i 5072 [ # # ]:UNC 0 : buf->wait_backend_pgprocno == MyProcNumber)
2926 andres@anarazel.de 5073 :UBC 0 : buf_state &= ~BM_PIN_COUNT_WAITER;
5074 : :
5075 : 0 : UnlockBufHdr(buf, buf_state);
5076 : :
6981 tgl@sss.pgh.pa.us 5077 : 0 : PinCountWaitBuf = NULL;
5078 : : }
9252 vadim4o@yahoo.com 5079 :CBC 45427 : }
5080 : :
5081 : : /*
5082 : : * Acquire or release the content_lock for the buffer.
5083 : : */
5084 : : void
9091 bruce@momjian.us 5085 : 150592046 : LockBuffer(Buffer buffer, int mode)
5086 : : {
5087 : : BufferDesc *buf;
5088 : :
1364 pg@bowt.ie 5089 [ - + + + : 150592046 : Assert(BufferIsPinned(buffer));
- + ]
9252 vadim4o@yahoo.com 5090 [ + + ]: 150592046 : if (BufferIsLocal(buffer))
6589 tgl@sss.pgh.pa.us 5091 : 9828845 : return; /* local buffers need no lock */
5092 : :
3363 andres@anarazel.de 5093 : 140763201 : buf = GetBufferDescriptor(buffer - 1);
5094 : :
9252 vadim4o@yahoo.com 5095 [ + + ]: 140763201 : if (mode == BUFFER_LOCK_UNLOCK)
3043 rhaas@postgresql.org 5096 : 71091205 : LWLockRelease(BufferDescriptorGetContentLock(buf));
9252 vadim4o@yahoo.com 5097 [ + + ]: 69671996 : else if (mode == BUFFER_LOCK_SHARE)
3043 rhaas@postgresql.org 5098 : 49703457 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
9252 vadim4o@yahoo.com 5099 [ + - ]: 19968539 : else if (mode == BUFFER_LOCK_EXCLUSIVE)
3043 rhaas@postgresql.org 5100 : 19968539 : LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
5101 : : else
7570 tgl@sss.pgh.pa.us 5102 [ # # ]:UBC 0 : elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5103 : : }
5104 : :
5105 : : /*
5106 : : * Acquire the content_lock for the buffer, but only if we don't have to wait.
5107 : : *
5108 : : * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
5109 : : */
5110 : : bool
7553 tgl@sss.pgh.pa.us 5111 :CBC 1309073 : ConditionalLockBuffer(Buffer buffer)
5112 : : {
5113 : : BufferDesc *buf;
5114 : :
1364 pg@bowt.ie 5115 [ - + + + : 1309073 : Assert(BufferIsPinned(buffer));
- + ]
7553 tgl@sss.pgh.pa.us 5116 [ + + ]: 1309073 : if (BufferIsLocal(buffer))
5117 : 64786 : return true; /* act as though we got it */
5118 : :
3363 andres@anarazel.de 5119 : 1244287 : buf = GetBufferDescriptor(buffer - 1);
5120 : :
3043 rhaas@postgresql.org 5121 : 1244287 : return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
5122 : : LW_EXCLUSIVE);
5123 : : }
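
/*
 * Illustrative pattern (sketch, hypothetical caller): code that must not
 * wait while holding other resources typically tries the conditional form
 * first and takes an alternative path on failure:
 *
 *     if (!ConditionalLockBuffer(buffer))
 *     {
 *         ... skip this page, or release conflicting locks and retry ...
 *     }
 */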
5124 : :
5125 : : /*
5126 : : * Verify that this backend is pinning the buffer exactly once.
5127 : : *
5128 : : * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
5129 : : * holds a pin on the buffer. We do not care whether some other backend does.
5130 : : */
5131 : : void
375 andres@anarazel.de 5132 : 3191689 : CheckBufferIsPinnedOnce(Buffer buffer)
5133 : : {
5134 [ + + ]: 3191689 : if (BufferIsLocal(buffer))
5135 : : {
5136 [ - + ]: 16 : if (LocalRefCount[-buffer - 1] != 1)
375 andres@anarazel.de 5137 [ # # ]:UBC 0 : elog(ERROR, "incorrect local pin count: %d",
5138 : : LocalRefCount[-buffer - 1]);
5139 : : }
5140 : : else
5141 : : {
375 andres@anarazel.de 5142 [ - + ]:CBC 3191673 : if (GetPrivateRefCount(buffer) != 1)
375 andres@anarazel.de 5143 [ # # ]:UBC 0 : elog(ERROR, "incorrect local pin count: %d",
5144 : : GetPrivateRefCount(buffer));
5145 : : }
375 andres@anarazel.de 5146 :CBC 3191689 : }
5147 : :
5148 : : /*
5149 : : * LockBufferForCleanup - lock a buffer in preparation for deleting items
5150 : : *
5151 : : * Items may be deleted from a disk page only when the caller (a) holds an
5152 : : * exclusive lock on the buffer and (b) has observed that no other backend
5153 : : * holds a pin on the buffer. If there is a pin, then the other backend
5154 : : * might have a pointer into the buffer (for example, a heapscan reference
5155 : : * to an item --- see README for more details). It's OK if a pin is added
5156 : : * after the cleanup starts, however; the newly-arrived backend will be
5157 : : * unable to look at the page until we release the exclusive lock.
5158 : : *
5159 : : * To implement this protocol, a would-be deleter must pin the buffer and
5160 : : * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
5161 : : * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
5162 : : * it has successfully observed pin count = 1.
5163 : : */
5164 : : void
8318 tgl@sss.pgh.pa.us 5165 : 21773 : LockBufferForCleanup(Buffer buffer)
5166 : : {
5167 : : BufferDesc *bufHdr;
1192 fujii@postgresql.org 5168 : 21773 : TimestampTz waitStart = 0;
419 drowley@postgresql.o 5169 : 21773 : bool waiting = false;
1192 fujii@postgresql.org 5170 : 21773 : bool logged_recovery_conflict = false;
5171 : :
1364 pg@bowt.ie 5172 [ - + + + : 21773 : Assert(BufferIsPinned(buffer));
- + ]
7120 tgl@sss.pgh.pa.us 5173 [ - + ]: 21773 : Assert(PinCountWaitBuf == NULL);
5174 : :
375 andres@anarazel.de 5175 : 21773 : CheckBufferIsPinnedOnce(buffer);
5176 : :
5177 : : /* Nobody else to wait for */
8318 tgl@sss.pgh.pa.us 5178 [ + + ]: 21773 : if (BufferIsLocal(buffer))
5179 : 16 : return;
5180 : :
3363 andres@anarazel.de 5181 : 21757 : bufHdr = GetBufferDescriptor(buffer - 1);
5182 : :
5183 : : for (;;)
8318 tgl@sss.pgh.pa.us 5184 :GBC 11 : {
5185 : : uint32 buf_state;
5186 : :
5187 : : /* Try to acquire lock */
8318 tgl@sss.pgh.pa.us 5188 :CBC 21768 : LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2926 andres@anarazel.de 5189 : 21768 : buf_state = LockBufHdr(bufHdr);
5190 : :
5191 [ - + ]: 21768 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5192 [ + + ]: 21768 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5193 : : {
5194 : : /* Successfully acquired exclusive lock with pincount 1 */
5195 : 21757 : UnlockBufHdr(bufHdr, buf_state);
5196 : :
5197 : : /*
5198 : : * Emit the log message if recovery conflict on buffer pin was
5199 : : * resolved but the startup process waited longer than
5200 : : * deadlock_timeout for it.
5201 : : */
1187 fujii@postgresql.org 5202 [ + + ]: 21757 : if (logged_recovery_conflict)
1187 fujii@postgresql.org 5203 :GBC 2 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5204 : : waitStart, GetCurrentTimestamp(),
5205 : : NULL, false);
5206 : :
419 drowley@postgresql.o 5207 [ + + ]:CBC 21757 : if (waiting)
5208 : : {
5209 : : /* reset ps display to remove the suffix if we added one */
419 drowley@postgresql.o 5210 :GBC 2 : set_ps_display_remove_suffix();
5211 : 2 : waiting = false;
5212 : : }
8318 tgl@sss.pgh.pa.us 5213 :CBC 21757 : return;
5214 : : }
5215 : : /* Failed, so mark myself as waiting for pincount 1 */
2926 andres@anarazel.de 5216 [ - + ]:GBC 11 : if (buf_state & BM_PIN_COUNT_WAITER)
5217 : : {
2926 andres@anarazel.de 5218 :UBC 0 : UnlockBufHdr(bufHdr, buf_state);
8318 tgl@sss.pgh.pa.us 5219 : 0 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
7570 5220 [ # # ]: 0 : elog(ERROR, "multiple backends attempting to wait for pincount 1");
5221 : : }
52 heikki.linnakangas@i 5222 :GNC 11 : bufHdr->wait_backend_pgprocno = MyProcNumber;
7120 tgl@sss.pgh.pa.us 5223 :GBC 11 : PinCountWaitBuf = bufHdr;
2926 andres@anarazel.de 5224 : 11 : buf_state |= BM_PIN_COUNT_WAITER;
5225 : 11 : UnlockBufHdr(bufHdr, buf_state);
8318 tgl@sss.pgh.pa.us 5226 : 11 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5227 : :
5228 : : /* Wait to be signaled by UnpinBuffer() */
5195 simon@2ndQuadrant.co 5229 [ + - ]: 11 : if (InHotStandby)
5230 : : {
419 drowley@postgresql.o 5231 [ + + ]: 11 : if (!waiting)
5232 : : {
5233 : : /* adjust the process title to indicate that it's waiting */
5234 : 2 : set_ps_display_suffix("waiting");
5235 : 2 : waiting = true;
5236 : : }
5237 : :
5238 : : /*
5239 : : * Emit the log message if the startup process is waiting longer
5240 : : * than deadlock_timeout for recovery conflict on buffer pin.
5241 : : *
5242 : : * Skip this if first time through because the startup process has
5243 : : * not started waiting yet in this case. So, the wait start
5244 : : * timestamp is set after this logic.
5245 : : */
1192 fujii@postgresql.org 5246 [ + + + + ]: 11 : if (waitStart != 0 && !logged_recovery_conflict)
5247 : : {
5248 : 4 : TimestampTz now = GetCurrentTimestamp();
5249 : :
5250 [ + + ]: 4 : if (TimestampDifferenceExceeds(waitStart, now,
5251 : : DeadlockTimeout))
5252 : : {
5253 : 2 : LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5254 : : waitStart, now, NULL, true);
5255 : 2 : logged_recovery_conflict = true;
5256 : : }
5257 : : }
5258 : :
5259 : : /*
5260 : : * Set the wait start timestamp if logging is enabled and first
5261 : : * time through.
5262 : : */
5263 [ + - + + ]: 11 : if (log_recovery_conflict_waits && waitStart == 0)
5264 : 2 : waitStart = GetCurrentTimestamp();
5265 : :
5266 : : /* Publish the bufid that Startup process waits on */
5195 simon@2ndQuadrant.co 5267 : 11 : SetStartupBufferPinWaitBufId(buffer - 1);
5268 : : /* Set alarm and then wait to be signaled by UnpinBuffer() */
5269 : 11 : ResolveRecoveryConflictWithBufferPin();
5270 : : /* Reset the published bufid */
5271 : 11 : SetStartupBufferPinWaitBufId(-1);
5272 : : }
5273 : : else
286 michael@paquier.xyz 5274 :UNC 0 : ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5275 : :
5276 : : /*
5277 : : * Remove flag marking us as waiter. Normally this will not be set
5278 : : * anymore, but ProcWaitForSignal() can return for other signals as
5279 : : * well. We take care to only reset the flag if we're the waiter, as
5280 : : * theoretically another backend could have started waiting. That's
5281 : : * impossible with the current usages due to table level locking, but
5282 : : * better be safe.
5283 : : */
2926 andres@anarazel.de 5284 :GBC 11 : buf_state = LockBufHdr(bufHdr);
5285 [ + + ]: 11 : if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
52 heikki.linnakangas@i 5286 [ + - ]:GNC 9 : bufHdr->wait_backend_pgprocno == MyProcNumber)
2926 andres@anarazel.de 5287 :GBC 9 : buf_state &= ~BM_PIN_COUNT_WAITER;
5288 : 11 : UnlockBufHdr(bufHdr, buf_state);
5289 : :
7120 tgl@sss.pgh.pa.us 5290 : 11 : PinCountWaitBuf = NULL;
5291 : : /* Loop back and try again */
5292 : : }
5293 : : }
5294 : :
5295 : : /*
5296 : : * Check called from ProcessRecoveryConflictInterrupts() when Startup process
5297 : : * requests cancellation of all pin holders that are blocking it.
5298 : : */
5299 : : bool
5195 simon@2ndQuadrant.co 5300 : 4 : HoldingBufferPinThatDelaysRecovery(void)
5301 : : {
5161 bruce@momjian.us 5302 : 4 : int bufid = GetStartupBufferPinWaitBufId();
5303 : :
5304 : : /*
5305 : : * If we get woken slowly then it's possible that the Startup process was
5306 : : * already woken by other backends before we got here. It's also possible
5307 : : * that we get here by multiple interrupts or interrupts at inappropriate
5308 : : * times, so make sure we do nothing if the bufid is not set.
5309 : : */
5195 simon@2ndQuadrant.co 5310 [ + + ]: 4 : if (bufid < 0)
5311 : 2 : return false;
5312 : :
3515 andres@anarazel.de 5313 [ + - ]: 2 : if (GetPrivateRefCount(bufid + 1) > 0)
5195 simon@2ndQuadrant.co 5314 : 2 : return true;
5315 : :
5195 simon@2ndQuadrant.co 5316 :UBC 0 : return false;
5317 : : }
5318 : :
5319 : : /*
5320 : : * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
5321 : : *
5322 : : * We won't loop, but just check once to see if the pin count is OK. If
5323 : : * not, return false with no lock held.
5324 : : */
5325 : : bool
6051 tgl@sss.pgh.pa.us 5326 :CBC 363571 : ConditionalLockBufferForCleanup(Buffer buffer)
5327 : : {
5328 : : BufferDesc *bufHdr;
5329 : : uint32 buf_state,
5330 : : refcount;
5331 : :
5332 [ - + ]: 363571 : Assert(BufferIsValid(buffer));
5333 : :
5334 [ + + ]: 363571 : if (BufferIsLocal(buffer))
5335 : : {
2926 andres@anarazel.de 5336 : 785 : refcount = LocalRefCount[-buffer - 1];
5337 : : /* There should be exactly one pin */
5338 [ - + ]: 785 : Assert(refcount > 0);
5339 [ + + ]: 785 : if (refcount != 1)
6051 tgl@sss.pgh.pa.us 5340 : 21 : return false;
5341 : : /* Nobody else to wait for */
5342 : 764 : return true;
5343 : : }
5344 : :
5345 : : /* There should be exactly one local pin */
2926 andres@anarazel.de 5346 : 362786 : refcount = GetPrivateRefCount(buffer);
5347 [ - + ]: 362786 : Assert(refcount);
5348 [ + + ]: 362786 : if (refcount != 1)
6051 tgl@sss.pgh.pa.us 5349 : 139 : return false;
5350 : :
5351 : : /* Try to acquire lock */
5352 [ + + ]: 362647 : if (!ConditionalLockBuffer(buffer))
5353 : 64 : return false;
5354 : :
3363 andres@anarazel.de 5355 : 362583 : bufHdr = GetBufferDescriptor(buffer - 1);
2926 5356 : 362583 : buf_state = LockBufHdr(bufHdr);
5357 : 362583 : refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5358 : :
5359 [ - + ]: 362583 : Assert(refcount > 0);
5360 [ + + ]: 362583 : if (refcount == 1)
5361 : : {
5362 : : /* Successfully acquired exclusive lock with pincount 1 */
5363 : 362524 : UnlockBufHdr(bufHdr, buf_state);
6051 tgl@sss.pgh.pa.us 5364 : 362524 : return true;
5365 : : }
5366 : :
5367 : : /* Failed, so release the lock */
2926 andres@anarazel.de 5368 : 59 : UnlockBufHdr(bufHdr, buf_state);
6051 tgl@sss.pgh.pa.us 5369 : 59 : LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5370 : 59 : return false;
5371 : : }
5372 : :
5373 : : /*
5374 : : * IsBufferCleanupOK - as above, but we already have the lock
5375 : : *
5376 : : * Check whether it's OK to perform cleanup on a buffer we've already
5377 : : * locked. If we observe that the pin count is 1, our exclusive lock
5378 : : * happens to be a cleanup lock, and we can proceed with anything that
5379 : : * would have been allowable had we sought a cleanup lock originally.
5380 : : */
5381 : : bool
2718 rhaas@postgresql.org 5382 : 2030 : IsBufferCleanupOK(Buffer buffer)
5383 : : {
5384 : : BufferDesc *bufHdr;
5385 : : uint32 buf_state;
5386 : :
5387 [ - + ]: 2030 : Assert(BufferIsValid(buffer));
5388 : :
5389 [ - + ]: 2030 : if (BufferIsLocal(buffer))
5390 : : {
5391 : : /* There should be exactly one pin */
2718 rhaas@postgresql.org 5392 [ # # ]:UBC 0 : if (LocalRefCount[-buffer - 1] != 1)
5393 : 0 : return false;
5394 : : /* Nobody else to wait for */
5395 : 0 : return true;
5396 : : }
5397 : :
5398 : : /* There should be exactly one local pin */
2718 rhaas@postgresql.org 5399 [ - + ]:CBC 2030 : if (GetPrivateRefCount(buffer) != 1)
2718 rhaas@postgresql.org 5400 :UBC 0 : return false;
5401 : :
2718 rhaas@postgresql.org 5402 :CBC 2030 : bufHdr = GetBufferDescriptor(buffer - 1);
5403 : :
5404 : : /* caller must hold exclusive lock on buffer */
5405 [ - + ]: 2030 : Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5406 : : LW_EXCLUSIVE));
5407 : :
5408 : 2030 : buf_state = LockBufHdr(bufHdr);
5409 : :
5410 [ - + ]: 2030 : Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5411 [ + - ]: 2030 : if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5412 : : {
5413 : : /* pincount is OK. */
5414 : 2030 : UnlockBufHdr(bufHdr, buf_state);
5415 : 2030 : return true;
5416 : : }
5417 : :
2718 rhaas@postgresql.org 5418 :UBC 0 : UnlockBufHdr(bufHdr, buf_state);
5419 : 0 : return false;
5420 : : }
5421 : :
5422 : :
5423 : : /*
5424 : : * Functions for buffer I/O handling
5425 : : *
5426 : : * Note: We assume that nested buffer I/O never occurs;
5427 : : * i.e., at most one BM_IO_IN_PROGRESS bit is set per process.
5428 : : *
5429 : : * Also note that these are used only for shared buffers, not local ones.
5430 : : */
5431 : :
5432 : : /*
5433 : : * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
5434 : : */
5435 : : static void
3072 rhaas@postgresql.org 5436 :CBC 4409 : WaitIO(BufferDesc *buf)
5437 : : {
1130 tmunro@postgresql.or 5438 : 4409 : ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5439 : :
5440 : 4409 : ConditionVariablePrepareToSleep(cv);
5441 : : for (;;)
6981 tgl@sss.pgh.pa.us 5442 : 4021 : {
5443 : : uint32 buf_state;
5444 : :
5445 : : /*
5446 : : * It may not be necessary to acquire the spinlock to check the flag
5447 : : * here, but since this test is essential for correctness, we'd better
5448 : : * play it safe.
5449 : : */
2926 andres@anarazel.de 5450 : 8430 : buf_state = LockBufHdr(buf);
5451 : 8430 : UnlockBufHdr(buf, buf_state);
5452 : :
5453 [ + + ]: 8430 : if (!(buf_state & BM_IO_IN_PROGRESS))
6981 tgl@sss.pgh.pa.us 5454 : 4409 : break;
1130 tmunro@postgresql.or 5455 : 4021 : ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5456 : : }
5457 : 4409 : ConditionVariableCancelSleep();
6981 tgl@sss.pgh.pa.us 5458 : 4409 : }
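/*
 * Editor's note: an illustrative sketch, not part of bufmgr.c, of the generic
 * condition-variable protocol that WaitIO() instantiates: prepare to sleep,
 * re-test the condition, sleep, and cancel the sleep once the condition
 * holds.  condition_holds() is a hypothetical predicate; the wakeup side is
 * the ConditionVariableBroadcast() issued by TerminateBufferIO().
 *
 *     ConditionVariablePrepareToSleep(cv);
 *     while (!condition_holds())
 *         ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
 *     ConditionVariableCancelSleep();
 */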
5459 : :
5460 : : /*
5461 : : * StartBufferIO: begin I/O on this buffer
5462 : : * (Assumptions)
5463 : : * My process is executing no IO
5464 : : * The buffer is Pinned
5465 : : *
5466 : : * In some scenarios there are race conditions in which multiple backends
5467 : : * could attempt the same I/O operation concurrently. If someone else
5468 : : * has already started I/O on this buffer then we will block on the
5469 : : * I/O condition variable until he's done.
5470 : : *
5471 : : * Input operations are only attempted on buffers that are not BM_VALID,
5472 : : * and output operations only on buffers that are BM_VALID and BM_DIRTY,
5473 : : * so we can always tell if the work is already done.
5474 : : *
5475 : : * Returns true if we successfully marked the buffer as I/O busy,
5476 : : * false if someone else already did the work.
5477 : : *
5478 : : * If nowait is true, then we don't wait for an I/O to be finished by another
5479 : : * backend. In that case, false indicates either that the I/O was already
5480 : : * finished, or that it is still in progress. This is useful for callers that want to
5481 : : * find out if they can perform the I/O as part of a larger operation, without
5482 : : * waiting for the answer or distinguishing the reasons why not.
5483 : : */
5484 : : static bool
11 tmunro@postgresql.or 5485 :GNC 1881120 : StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
5486 : : {
5487 : : uint32 buf_state;
5488 : :
158 heikki.linnakangas@i 5489 : 1881120 : ResourceOwnerEnlarge(CurrentResourceOwner);
5490 : :
5491 : : for (;;)
5492 : : {
2926 andres@anarazel.de 5493 :CBC 1885528 : buf_state = LockBufHdr(buf);
5494 : :
5495 [ + + ]: 1885528 : if (!(buf_state & BM_IO_IN_PROGRESS))
6981 tgl@sss.pgh.pa.us 5496 : 1881120 : break;
2926 andres@anarazel.de 5497 : 4408 : UnlockBufHdr(buf, buf_state);
11 tmunro@postgresql.or 5498 [ - + ]:GNC 4408 : if (nowait)
11 tmunro@postgresql.or 5499 :UNC 0 : return false;
6981 tgl@sss.pgh.pa.us 5500 :CBC 4408 : WaitIO(buf);
5501 : : }
5502 : :
5503 : : /* Once we get here, there is definitely no I/O active on this buffer */
5504 : :
2926 andres@anarazel.de 5505 [ + + + + ]: 1881120 : if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5506 : : {
5507 : : /* someone else already did the I/O */
5508 : 5935 : UnlockBufHdr(buf, buf_state);
6981 tgl@sss.pgh.pa.us 5509 : 5935 : return false;
5510 : : }
5511 : :
2926 andres@anarazel.de 5512 : 1875185 : buf_state |= BM_IO_IN_PROGRESS;
5513 : 1875185 : UnlockBufHdr(buf, buf_state);
5514 : :
375 5515 : 1875185 : ResourceOwnerRememberBufferIO(CurrentResourceOwner,
5516 : : BufferDescriptorGetBuffer(buf));
5517 : :
6981 tgl@sss.pgh.pa.us 5518 : 1875185 : return true;
5519 : : }
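/*
 * Editor's note: an illustrative sketch, not part of bufmgr.c, of the caller
 * protocol around StartBufferIO() when reading a page into a pinned buffer.
 * A false return means another backend already made the page valid, so the
 * caller simply skips the read.  The smgr call is abbreviated; real read
 * traffic goes through WaitReadBuffers().  reln/forkNum/blockNum stand in
 * for the buffer's tag.
 *
 *     if (StartBufferIO(bufHdr, true, false))
 *     {
 *         smgrread(reln, forkNum, blockNum, BufHdrGetBlock(bufHdr));
 *         TerminateBufferIO(bufHdr, false, BM_VALID, true);
 *     }
 */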
5520 : :
5521 : : /*
5522 : : * TerminateBufferIO: release a buffer we were doing I/O on
5523 : : * (Assumptions)
5524 : : * My process is executing IO for the buffer
5525 : : * BM_IO_IN_PROGRESS bit is set for the buffer
5526 : : * The buffer is Pinned
5527 : : *
5528 : : * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
5529 : : * buffer's BM_DIRTY flag. This is appropriate when terminating a
5530 : : * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
5531 : : * marking the buffer clean if it was re-dirtied while we were writing.
5532 : : *
5533 : : * set_flag_bits gets ORed into the buffer's flags. It must include
5534 : : * BM_IO_ERROR in a failure case. For successful completion it could
5535 : : * be 0, or BM_VALID if we just finished reading in the page.
5536 : : *
5537 : : * If forget_owner is true, we release the buffer I/O from the current
5538 : : * resource owner. (forget_owner=false is used when the resource owner itself
5539 : : * is being released)
5540 : : */
5541 : : static void
158 heikki.linnakangas@i 5542 :GNC 1875185 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
5543 : : bool forget_owner)
5544 : : {
5545 : : uint32 buf_state;
5546 : :
2926 andres@anarazel.de 5547 :CBC 1875185 : buf_state = LockBufHdr(buf);
5548 : :
5549 [ - + ]: 1875185 : Assert(buf_state & BM_IO_IN_PROGRESS);
5550 : :
5551 : 1875185 : buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
5552 [ + + + + ]: 1875185 : if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
5553 : 530328 : buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
5554 : :
5555 : 1875185 : buf_state |= set_flag_bits;
5556 : 1875185 : UnlockBufHdr(buf, buf_state);
5557 : :
158 heikki.linnakangas@i 5558 [ + + ]:GNC 1875185 : if (forget_owner)
5559 : 1875170 : ResourceOwnerForgetBufferIO(CurrentResourceOwner,
5560 : : BufferDescriptorGetBuffer(buf));
5561 : :
1130 tmunro@postgresql.or 5562 :CBC 1875185 : ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
8854 inoue@tpf.co.jp 5563 : 1875185 : }
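/*
 * Editor's note: an illustrative sketch, not part of bufmgr.c, showing the
 * write side of the protocol, roughly the shape of FlushBuffer().  Passing
 * clear_dirty = true clears BM_DIRTY unless the page was re-dirtied
 * (BM_JUST_DIRTIED) while the write was in flight.  reln/forkNum/blockNum
 * and bufToWrite stand in for the buffer's tag and page image.
 *
 *     if (StartBufferIO(bufHdr, false, false))
 *     {
 *         smgrwrite(reln, forkNum, blockNum, bufToWrite, false);
 *         TerminateBufferIO(bufHdr, true, 0, true);
 *     }
 */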
5564 : :
5565 : : /*
5566 : : * AbortBufferIO: Clean up active buffer I/O after an error.
5567 : : *
5568 : : * All LWLocks we might have held have been released,
5569 : : * but we haven't yet released buffer pins, so the buffer is still pinned.
5570 : : *
5571 : : * If I/O was in progress, we always set BM_IO_ERROR, even though it's
5572 : : * possible the error condition wasn't related to the I/O.
5573 : : *
5574 : : * Note: this does not remove the buffer I/O from the resource owner.
5575 : : * That's correct when we're releasing the whole resource owner, but
5576 : : * beware if you use this in other contexts.
5577 : : */
5578 : : static void
367 pg@bowt.ie 5579 : 15 : AbortBufferIO(Buffer buffer)
5580 : : {
5581 : 15 : BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5582 : : uint32 buf_state;
5583 : :
375 andres@anarazel.de 5584 : 15 : buf_state = LockBufHdr(buf_hdr);
5585 [ - + ]: 15 : Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5586 : :
5587 [ + - ]: 15 : if (!(buf_state & BM_VALID))
5588 : : {
5589 [ - + ]: 15 : Assert(!(buf_state & BM_DIRTY));
5590 : 15 : UnlockBufHdr(buf_hdr, buf_state);
5591 : : }
5592 : : else
5593 : : {
373 andres@anarazel.de 5594 [ # # ]:UBC 0 : Assert(buf_state & BM_DIRTY);
375 5595 : 0 : UnlockBufHdr(buf_hdr, buf_state);
5596 : :
5597 : : /* Issue notice if this is not the first failure... */
5598 [ # # ]: 0 : if (buf_state & BM_IO_ERROR)
5599 : : {
5600 : : /* Buffer is pinned, so we can read tag without spinlock */
5601 : : char *path;
5602 : :
5603 : 0 : path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5604 : : BufTagGetForkNum(&buf_hdr->tag));
5605 [ # # ]: 0 : ereport(WARNING,
5606 : : (errcode(ERRCODE_IO_ERROR),
5607 : : errmsg("could not write block %u of %s",
5608 : : buf_hdr->tag.blockNum, path),
5609 : : errdetail("Multiple failures --- write error might be permanent.")));
5610 : 0 : pfree(path);
5611 : : }
5612 : : }
5613 : :
158 heikki.linnakangas@i 5614 :GNC 15 : TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
8854 inoue@tpf.co.jp 5615 :CBC 15 : }
5616 : :
5617 : : /*
5618 : : * Error context callback for errors occurring during shared buffer writes.
5619 : : */
5620 : : static void
4993 rhaas@postgresql.org 5621 : 41 : shared_buffer_write_error_callback(void *arg)
5622 : : {
3072 5623 : 41 : BufferDesc *bufHdr = (BufferDesc *) arg;
5624 : :
5625 : : /* Buffer is pinned, so we can read the tag without locking the spinlock */
7645 tgl@sss.pgh.pa.us 5626 [ + - ]: 41 : if (bufHdr != NULL)
5627 : : {
599 rhaas@postgresql.org 5628 : 41 : char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
5629 : : BufTagGetForkNum(&bufHdr->tag));
5630 : :
4993 5631 : 41 : errcontext("writing block %u of relation %s",
5632 : : bufHdr->tag.blockNum, path);
5633 : 41 : pfree(path);
5634 : : }
5635 : 41 : }
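/*
 * Editor's note: an illustrative sketch, not part of bufmgr.c, of how this
 * callback gets wired up.  FlushBuffer() pushes an ErrorContextCallback onto
 * error_context_stack for the duration of the write, so that any error raised
 * while writing gets the "writing block ... of relation ..." context line.
 *
 *     ErrorContextCallback errcallback;
 *
 *     errcallback.callback = shared_buffer_write_error_callback;
 *     errcallback.arg = buf;
 *     errcallback.previous = error_context_stack;
 *     error_context_stack = &errcallback;
 *
 *     ... perform the write ...
 *
 *     error_context_stack = errcallback.previous;
 */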
5636 : :
5637 : : /*
5638 : : * Error context callback for errors occurring during local buffer writes.
5639 : : */
5640 : : static void
4993 rhaas@postgresql.org 5641 :UBC 0 : local_buffer_write_error_callback(void *arg)
5642 : : {
3072 5643 : 0 : BufferDesc *bufHdr = (BufferDesc *) arg;
5644 : :
4993 5645 [ # # ]: 0 : if (bufHdr != NULL)
5646 : : {
599 5647 : 0 : char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5648 : : MyProcNumber,
5649 : : BufTagGetForkNum(&bufHdr->tag));
5650 : :
5633 heikki.linnakangas@i 5651 : 0 : errcontext("writing block %u of relation %s",
5652 : : bufHdr->tag.blockNum, path);
5653 : 0 : pfree(path);
5654 : : }
7645 tgl@sss.pgh.pa.us 5655 : 0 : }
5656 : :
5657 : : /*
5658 : : * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
5659 : : */
5660 : : static int
648 rhaas@postgresql.org 5661 :CBC 9984884 : rlocator_comparator(const void *p1, const void *p2)
5662 : : {
5663 : 9984884 : RelFileLocator n1 = *(const RelFileLocator *) p1;
5664 : 9984884 : RelFileLocator n2 = *(const RelFileLocator *) p2;
5665 : :
5666 [ + + ]: 9984884 : if (n1.relNumber < n2.relNumber)
4105 alvherre@alvh.no-ip. 5667 : 9155607 : return -1;
648 rhaas@postgresql.org 5668 [ + + ]: 829277 : else if (n1.relNumber > n2.relNumber)
4105 alvherre@alvh.no-ip. 5669 : 174163 : return 1;
5670 : :
648 rhaas@postgresql.org 5671 [ + + ]: 655114 : if (n1.dbOid < n2.dbOid)
4105 alvherre@alvh.no-ip. 5672 : 55323 : return -1;
648 rhaas@postgresql.org 5673 [ + + ]: 599791 : else if (n1.dbOid > n2.dbOid)
4105 alvherre@alvh.no-ip. 5674 : 68490 : return 1;
5675 : :
648 rhaas@postgresql.org 5676 [ - + ]: 531301 : if (n1.spcOid < n2.spcOid)
4105 alvherre@alvh.no-ip. 5677 :UBC 0 : return -1;
648 rhaas@postgresql.org 5678 [ - + ]:CBC 531301 : else if (n1.spcOid > n2.spcOid)
4105 alvherre@alvh.no-ip. 5679 :UBC 0 : return 1;
5680 : : else
4105 alvherre@alvh.no-ip. 5681 :CBC 531301 : return 0;
5682 : : }
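/*
 * Editor's note: an illustrative sketch, not part of bufmgr.c.  The
 * comparator is used both to qsort() an array of RelFileLocators and to probe
 * it with bsearch(), as DropRelationsAllBuffers() does when deciding whether
 * a buffer's relation is among those being dropped.
 *
 *     qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
 *     ...
 *     found = bsearch(&key, locators, n, sizeof(RelFileLocator),
 *                     rlocator_comparator) != NULL;
 */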
5683 : :
5684 : : /*
5685 : : * Lock buffer header - set BM_LOCKED in buffer state.
5686 : : */
5687 : : uint32
2926 andres@anarazel.de 5688 : 26884984 : LockBufHdr(BufferDesc *desc)
5689 : : {
5690 : : SpinDelayStatus delayStatus;
5691 : : uint32 old_buf_state;
5692 : :
375 5693 [ - + ]: 26884984 : Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
5694 : :
2922 5695 : 26884984 : init_local_spin_delay(&delayStatus);
5696 : :
5697 : : while (true)
5698 : : {
5699 : : /* set BM_LOCKED flag */
2926 5700 : 26910677 : old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5701 : : /* if it wasn't set before we're OK */
5702 [ + + ]: 26910677 : if (!(old_buf_state & BM_LOCKED))
5703 : 26884984 : break;
5704 : 25693 : perform_spin_delay(&delayStatus);
5705 : : }
5706 : 26884984 : finish_spin_delay(&delayStatus);
5707 : 26884984 : return old_buf_state | BM_LOCKED;
5708 : : }
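/*
 * Editor's note: an illustrative sketch, not part of bufmgr.c, of the usual
 * header-lock pattern.  The value returned by LockBufHdr() already has
 * BM_LOCKED set; the caller inspects or modifies it and passes the desired
 * new state to UnlockBufHdr(), which clears BM_LOCKED as it stores it.
 *
 *     buf_state = LockBufHdr(bufHdr);
 *     if (buf_state & BM_DIRTY)
 *         buf_state |= BM_CHECKPOINT_NEEDED;
 *     UnlockBufHdr(bufHdr, buf_state);
 */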
5709 : :
5710 : : /*
5711 : : * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
5712 : : * state at that point.
5713 : : *
5714 : : * Obviously the buffer could be locked by the time the value is returned, so
5715 : : * this is primarily useful in CAS-style loops.
5716 : : */
5717 : : static uint32
5718 : 2134 : WaitBufHdrUnlocked(BufferDesc *buf)
5719 : : {
5720 : : SpinDelayStatus delayStatus;
5721 : : uint32 buf_state;
5722 : :
2922 5723 : 2134 : init_local_spin_delay(&delayStatus);
5724 : :
2926 5725 : 2134 : buf_state = pg_atomic_read_u32(&buf->state);
5726 : :
5727 [ + + ]: 18387 : while (buf_state & BM_LOCKED)
5728 : : {
5729 : 16253 : perform_spin_delay(&delayStatus);
5730 : 16253 : buf_state = pg_atomic_read_u32(&buf->state);
5731 : : }
5732 : :
5733 : 2134 : finish_spin_delay(&delayStatus);
5734 : :
5735 : 2134 : return buf_state;
5736 : : }
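/*
 * Editor's note: an illustrative sketch, not part of bufmgr.c, of the
 * CAS-style loop this helper is intended for (PinBuffer() has the same
 * shape).  If the header is spinlocked, wait for it to be released, then try
 * to install the new state; on CAS failure old_buf_state is refreshed and the
 * loop retries.
 *
 *     old_buf_state = pg_atomic_read_u32(&buf->state);
 *     for (;;)
 *     {
 *         if (old_buf_state & BM_LOCKED)
 *             old_buf_state = WaitBufHdrUnlocked(buf);
 *         buf_state = old_buf_state + BUF_REFCOUNT_ONE;
 *         if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
 *                                            buf_state))
 *             break;
 *     }
 */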
5737 : :
5738 : : /*
5739 : : * BufferTag comparator.
5740 : : */
5741 : : static inline int
1129 tmunro@postgresql.or 5742 : 934672 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
5743 : : {
5744 : : int ret;
5745 : : RelFileLocator rlocatora;
5746 : : RelFileLocator rlocatorb;
5747 : :
599 rhaas@postgresql.org 5748 : 934672 : rlocatora = BufTagGetRelFileLocator(ba);
5749 : 934672 : rlocatorb = BufTagGetRelFileLocator(bb);
5750 : :
5751 : 934672 : ret = rlocator_comparator(&rlocatora, &rlocatorb);
5752 : :
2977 andres@anarazel.de 5753 [ + + ]: 934672 : if (ret != 0)
5754 : 405030 : return ret;
5755 : :
599 rhaas@postgresql.org 5756 [ + + ]: 529642 : if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
2977 andres@anarazel.de 5757 : 30700 : return -1;
599 rhaas@postgresql.org 5758 [ + + ]: 498942 : if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
2977 andres@anarazel.de 5759 : 19368 : return 1;
5760 : :
5761 [ + + ]: 479574 : if (ba->blockNum < bb->blockNum)
5762 : 314257 : return -1;
5763 [ + + ]: 165317 : if (ba->blockNum > bb->blockNum)
5764 : 163952 : return 1;
5765 : :
5766 : 1365 : return 0;
5767 : : }
5768 : :
5769 : : /*
5770 : : * Comparator determining the writeout order in a checkpoint.
5771 : : *
5772 : : * It is important that tablespaces are compared first; the logic balancing
5773 : : * writes between tablespaces relies on it.
5774 : : */
5775 : : static inline int
1129 tmunro@postgresql.or 5776 : 2549484 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
5777 : : {
5778 : : /* compare tablespace */
2977 andres@anarazel.de 5779 [ + + ]: 2549484 : if (a->tsId < b->tsId)
5780 : 4006 : return -1;
5781 [ + + ]: 2545478 : else if (a->tsId > b->tsId)
5782 : 15564 : return 1;
5783 : : /* compare relation */
648 rhaas@postgresql.org 5784 [ + + ]: 2529914 : if (a->relNumber < b->relNumber)
2977 andres@anarazel.de 5785 : 714848 : return -1;
648 rhaas@postgresql.org 5786 [ + + ]: 1815066 : else if (a->relNumber > b->relNumber)
2977 andres@anarazel.de 5787 : 694620 : return 1;
5788 : : /* compare fork */
5789 [ + + ]: 1120446 : else if (a->forkNum < b->forkNum)
5790 : 50932 : return -1;
5791 [ + + ]: 1069514 : else if (a->forkNum > b->forkNum)
5792 : 52000 : return 1;
5793 : : /* compare block number */
5794 [ + + ]: 1017514 : else if (a->blockNum < b->blockNum)
5795 : 492826 : return -1;
2286 tgl@sss.pgh.pa.us 5796 [ + + ]: 524688 : else if (a->blockNum > b->blockNum)
2977 andres@anarazel.de 5797 : 479915 : return 1;
5798 : : /* equal page IDs are unlikely, but not impossible */
2286 tgl@sss.pgh.pa.us 5799 : 44773 : return 0;
5800 : : }
5801 : :
5802 : : /*
5803 : : * Comparator for a Min-Heap over the per-tablespace checkpoint completion
5804 : : * progress.
5805 : : */
5806 : : static int
2977 andres@anarazel.de 5807 : 204417 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
5808 : : {
5809 : 204417 : CkptTsStatus *sa = (CkptTsStatus *) a;
5810 : 204417 : CkptTsStatus *sb = (CkptTsStatus *) b;
5811 : :
5812 : : /* we want a min-heap, so return 1 when a < b */
5813 [ + + ]: 204417 : if (sa->progress < sb->progress)
5814 : 197434 : return 1;
5815 [ + + ]: 6983 : else if (sa->progress == sb->progress)
5816 : 501 : return 0;
5817 : : else
5818 : 6482 : return -1;
5819 : : }
5820 : :
5821 : : /*
5822 : : * Initialize a writeback context, discarding potential previous state.
5823 : : *
5824 : : * *max_pending is a pointer instead of an immediate value, so the coalesce
5825 : : * limits can easily be changed by the GUC mechanism, and so calling code does
5826 : : * not have to check the current configuration. A value of 0 means that no
5827 : : * writeback control will be performed.
5828 : : */
5829 : : void
5830 : 2388 : WritebackContextInit(WritebackContext *context, int *max_pending)
5831 : : {
5832 [ - + ]: 2388 : Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5833 : :
5834 : 2388 : context->max_pending = max_pending;
5835 : 2388 : context->nr_pending = 0;
5836 : 2388 : }
5837 : :
5838 : : /*
5839 : : * Add buffer to list of pending writeback requests.
5840 : : */
5841 : : void
333 5842 : 526491 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
5843 : : BufferTag *tag)
5844 : : {
5845 : : PendingWriteback *pending;
5846 : :
372 tmunro@postgresql.or 5847 [ + + ]: 526491 : if (io_direct_flags & IO_DIRECT_DATA)
5848 : 553 : return;
5849 : :
5850 : : /*
5851 : : * Add buffer to the pending writeback array, unless writeback control is
5852 : : * disabled.
5853 : : */
333 andres@anarazel.de 5854 [ + + ]: 525938 : if (*wb_context->max_pending > 0)
5855 : : {
5856 [ - + ]: 274213 : Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
5857 : :
5858 : 274213 : pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
5859 : :
2977 5860 : 274213 : pending->tag = *tag;
5861 : : }
5862 : :
5863 : : /*
5864 : : * Perform pending flushes if the writeback limit is exceeded. This
5865 : : * includes the case where previously an item has been added, but control
5866 : : * is now disabled.
5867 : : */
333 5868 [ + + ]: 525938 : if (wb_context->nr_pending >= *wb_context->max_pending)
5869 : 259451 : IssuePendingWritebacks(wb_context, io_context);
5870 : : }
5871 : :
5872 : : #define ST_SORT sort_pending_writebacks
5873 : : #define ST_ELEMENT_TYPE PendingWriteback
5874 : : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
5875 : : #define ST_SCOPE static
5876 : : #define ST_DEFINE
5877 : : #include <lib/sort_template.h>
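/*
 * Editor's note: the defines above instantiate lib/sort_template.h, which
 * expands into a specialized sort routine using buffertag_comparator() for
 * element comparisons.  The generated function, used by
 * IssuePendingWritebacks() below, has approximately this signature:
 *
 *     static void sort_pending_writebacks(PendingWriteback *first, size_t n);
 */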
5878 : :
5879 : : /*
5880 : : * Issue all pending writeback requests, previously scheduled with
5881 : : * ScheduleBufferTagForWriteback, to the OS.
5882 : : *
5883 : : * Because this is only used to improve the OS's I/O scheduling, we try never
5884 : : * to error out - it's just a hint.
5885 : : */
5886 : : void
5887 : 260200 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
5888 : : {
5889 : : instr_time io_start;
5890 : : int i;
5891 : :
5892 [ + + ]: 260200 : if (wb_context->nr_pending == 0)
2977 5893 : 251779 : return;
5894 : :
5895 : : /*
5896 : : * Executing the writes in order can make them a lot faster, and allows us to
5897 : : * merge writeback requests for consecutive blocks into larger writebacks.
5898 : : */
333 5899 : 8421 : sort_pending_writebacks(wb_context->pending_writebacks,
5900 : 8421 : wb_context->nr_pending);
5901 : :
120 michael@paquier.xyz 5902 :GNC 8421 : io_start = pgstat_prepare_io_time(track_io_timing);
5903 : :
5904 : : /*
5905 : : * Coalesce neighbouring writes, but nothing else. For that we iterate
5906 : : * through the now-sorted array of pending flushes, and look ahead to
5907 : : * find all neighbouring (or identical) writes.
5908 : : */
333 andres@anarazel.de 5909 [ + + ]:CBC 94076 : for (i = 0; i < wb_context->nr_pending; i++)
5910 : : {
5911 : : PendingWriteback *cur;
5912 : : PendingWriteback *next;
5913 : : SMgrRelation reln;
5914 : : int ahead;
5915 : : BufferTag tag;
5916 : : RelFileLocator currlocator;
2977 5917 : 85655 : Size nblocks = 1;
5918 : :
333 5919 : 85655 : cur = &wb_context->pending_writebacks[i];
2977 5920 : 85655 : tag = cur->tag;
599 rhaas@postgresql.org 5921 : 85655 : currlocator = BufTagGetRelFileLocator(&tag);
5922 : :
5923 : : /*
5924 : : * Peek ahead, into following writeback requests, to see if they can
5925 : : * be combined with the current one.
5926 : : */
333 andres@anarazel.de 5927 [ + + ]: 270987 : for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
5928 : : {
5929 : :
5930 : 262566 : next = &wb_context->pending_writebacks[i + ahead + 1];
5931 : :
5932 : : /* different file, stop */
599 rhaas@postgresql.org 5933 [ + + + + : 262566 : if (!RelFileLocatorEquals(currlocator,
+ - ]
5934 [ + + ]: 208416 : BufTagGetRelFileLocator(&next->tag)) ||
5935 : 208416 : BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5936 : : break;
5937 : :
5938 : : /* ok, block queued twice, skip */
2977 andres@anarazel.de 5939 [ + + ]: 192351 : if (cur->tag.blockNum == next->tag.blockNum)
5940 : 1192 : continue;
5941 : :
5942 : : /* only merge consecutive writes */
5943 [ + + ]: 191159 : if (cur->tag.blockNum + 1 != next->tag.blockNum)
5944 : 7019 : break;
5945 : :
5946 : 184140 : nblocks++;
5947 : 184140 : cur = next;
5948 : : }
5949 : :
5950 : 85655 : i += ahead;
5951 : :
5952 : : /* and finally tell the kernel to write the data to storage */
42 heikki.linnakangas@i 5953 :GNC 85655 : reln = smgropen(currlocator, INVALID_PROC_NUMBER);
599 rhaas@postgresql.org 5954 :CBC 85655 : smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
5955 : : }
5956 : :
5957 : : /*
5958 : : * Assume that writeback requests are only issued for buffers containing
5959 : : * blocks of permanent relations.
5960 : : */
333 andres@anarazel.de 5961 : 8421 : pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
5962 : 8421 : IOOP_WRITEBACK, io_start, wb_context->nr_pending);
5963 : :
5964 : 8421 : wb_context->nr_pending = 0;
5965 : : }
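/*
 * Editor's note: a worked example, not part of bufmgr.c.  After sorting,
 * pending requests for blocks 10, 11, 12 and 40 of the same relation fork
 * collapse into two kernel requests:
 *
 *     smgrwriteback(reln, forknum, 10, 3);
 *     smgrwriteback(reln, forknum, 40, 1);
 *
 * A block queued twice is skipped, and merging stops at the first
 * non-consecutive block or at a different relation or fork.
 */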
5966 : :
5967 : : /* ResourceOwner callbacks */
5968 : :
5969 : : static void
158 heikki.linnakangas@i 5970 :GNC 15 : ResOwnerReleaseBufferIO(Datum res)
5971 : : {
5972 : 15 : Buffer buffer = DatumGetInt32(res);
5973 : :
5974 : 15 : AbortBufferIO(buffer);
5975 : 15 : }
5976 : :
5977 : : static char *
158 heikki.linnakangas@i 5978 :UNC 0 : ResOwnerPrintBufferIO(Datum res)
5979 : : {
5980 : 0 : Buffer buffer = DatumGetInt32(res);
5981 : :
5982 : 0 : return psprintf("lost track of buffer IO on buffer %d", buffer);
5983 : : }
5984 : :
5985 : : static void
158 heikki.linnakangas@i 5986 :GNC 4128 : ResOwnerReleaseBufferPin(Datum res)
5987 : : {
5988 : 4128 : Buffer buffer = DatumGetInt32(res);
5989 : :
5990 : : /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
5991 [ - + ]: 4128 : if (!BufferIsValid(buffer))
158 heikki.linnakangas@i 5992 [ # # ]:UNC 0 : elog(ERROR, "bad buffer ID: %d", buffer);
5993 : :
158 heikki.linnakangas@i 5994 [ + + ]:GNC 4128 : if (BufferIsLocal(buffer))
5995 : 377 : UnpinLocalBufferNoOwner(buffer);
5996 : : else
5997 : 3751 : UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
5998 : 4128 : }
5999 : :
6000 : : static char *
158 heikki.linnakangas@i 6001 :UNC 0 : ResOwnerPrintBufferPin(Datum res)
6002 : : {
6003 : 0 : return DebugPrintBufferRefcount(DatumGetInt32(res));
6004 : : }
6005 : :
6006 : : /*
6007 : : * Try to evict the current block in a shared buffer.
6008 : : *
6009 : : * This function is intended for testing/development use only!
6010 : : *
6011 : : * To succeed, the buffer must not be pinned on entry, so if the caller had a
6012 : : * particular block in mind, it might already have been replaced by some other
6013 : : * block by the time this function runs. It's also unpinned on return, so the
6014 : : * buffer might be occupied again by the time control is returned, potentially
6015 : : * even by the same block. This inherent raciness without other interlocking
6016 : : * makes the function unsuitable for non-testing usage.
6017 : : *
6018 : : * Returns true if the buffer was valid and it has now been made invalid.
6019 : : * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
6020 : : * or if the buffer becomes dirty again while we're trying to write it out.
6021 : : */
6022 : : bool
7 tmunro@postgresql.or 6023 : 0 : EvictUnpinnedBuffer(Buffer buf)
6024 : : {
6025 : : BufferDesc *desc;
6026 : : uint32 buf_state;
6027 : : bool result;
6028 : :
6029 : : /* Make sure we can pin the buffer. */
6030 : 0 : ResourceOwnerEnlarge(CurrentResourceOwner);
6031 : 0 : ReservePrivateRefCountEntry();
6032 : :
6033 [ # # ]: 0 : Assert(!BufferIsLocal(buf));
6034 : 0 : desc = GetBufferDescriptor(buf - 1);
6035 : :
6036 : : /* Lock the header and check if it's valid. */
6037 : 0 : buf_state = LockBufHdr(desc);
6038 [ # # ]: 0 : if ((buf_state & BM_VALID) == 0)
6039 : : {
6040 : 0 : UnlockBufHdr(desc, buf_state);
6041 : 0 : return false;
6042 : : }
6043 : :
6044 : : /* Check that it's not pinned already. */
6045 [ # # ]: 0 : if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6046 : : {
6047 : 0 : UnlockBufHdr(desc, buf_state);
6048 : 0 : return false;
6049 : : }
6050 : :
6051 : 0 : PinBuffer_Locked(desc); /* releases spinlock */
6052 : :
6053 : : /* If it was dirty, try to clean it once. */
6054 [ # # ]: 0 : if (buf_state & BM_DIRTY)
6055 : : {
6056 : 0 : LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
6057 : 0 : FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
6058 : 0 : LWLockRelease(BufferDescriptorGetContentLock(desc));
6059 : : }
6060 : :
6061 : : /* This will return false if it becomes dirty or someone else pins it. */
6062 : 0 : result = InvalidateVictimBuffer(desc);
6063 : :
6064 : 0 : UnpinBuffer(desc);
6065 : :
6066 : 0 : return result;
7 tmunro@postgresql.or 6067 :ECB (648) : }
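/*
 * Editor's note: an illustrative sketch, not part of bufmgr.c.  This function
 * is exposed for testing via contrib/pg_buffercache's pg_buffercache_evict()
 * SQL function; a C caller just passes a shared buffer number and checks the
 * result:
 *
 *     if (EvictUnpinnedBuffer(buf))
 *         elog(DEBUG1, "evicted buffer %d", buf);
 */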