LCOV - differential code coverage report
Current view: top level - src/backend/storage/buffer - bufmgr.c (source / functions) Coverage Total Hit UNC LBC UBC GBC GNC CBC EUB ECB DUB DCB
Current: Differential Code Coverage 16@8cea358b128 vs 17@8cea358b128 Lines: 91.1 % 1647 1501 56 90 44 224 1233 1 1 16 119
Current Date: 2024-04-14 14:21:10 Functions: 94.7 % 94 89 4 1 1 42 46 1 4
Baseline: 16@8cea358b128 Branches: 73.1 % 1162 850 57 4 251 25 115 710
Baseline Date: 2024-04-14 14:21:09 Line coverage date bins:
Legend: Lines: hit not hit | Branches: + taken - not taken # not executed [..60] days: 81.5 % 200 163 36 1 159 4 1
(60,120] days: 100.0 % 10 10 10
(120,180] days: 71.2 % 66 47 19 47
(180,240] days: 100.0 % 41 41 3 38
(240..) days: 93.2 % 1330 1240 1 89 44 5 1191 1
Function coverage date bins:
[..60] days: 90.0 % 10 9 1 9
(120,180] days: 66.7 % 9 6 3 6
(180,240] days: 100.0 % 5 5 2 3
(240..) days: 98.6 % 70 69 1 1 25 43
Branch coverage date bins:
[..60] days: 69.6 % 138 96 40 2 90 6
(60,120] days: 100.0 % 2 2 2
(120,180] days: 46.9 % 32 15 17 15
(180,240] days: 77.8 % 36 28 3 5 6 22
(240..) days: 74.3 % 954 709 1 244 25 2 682

 Age         Owner                    Branch data    TLA  Line data    Source code
                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  *
                                  3                 :                :  * bufmgr.c
                                  4                 :                :  *    buffer manager interface routines
                                  5                 :                :  *
                                  6                 :                :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
                                  7                 :                :  * Portions Copyright (c) 1994, Regents of the University of California
                                  8                 :                :  *
                                  9                 :                :  *
                                 10                 :                :  * IDENTIFICATION
                                 11                 :                :  *    src/backend/storage/buffer/bufmgr.c
                                 12                 :                :  *
                                 13                 :                :  *-------------------------------------------------------------------------
                                 14                 :                :  */
                                 15                 :                : /*
                                 16                 :                :  * Principal entry points:
                                 17                 :                :  *
                                 18                 :                :  * ReadBuffer() -- find or create a buffer holding the requested page,
                                 19                 :                :  *      and pin it so that no one can destroy it while this process
                                 20                 :                :  *      is using it.
                                 21                 :                :  *
                                 22                 :                :  * StartReadBuffer() -- as above, with separate wait step
                                 23                 :                :  * StartReadBuffers() -- multiple block version
                                 24                 :                :  * WaitReadBuffers() -- second step of above
                                 25                 :                :  *
                                 26                 :                :  * ReleaseBuffer() -- unpin a buffer
                                 27                 :                :  *
                                 28                 :                :  * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
                                 29                 :                :  *      The disk write is delayed until buffer replacement or checkpoint.
                                 30                 :                :  *
                                 31                 :                :  * See also these files:
                                 32                 :                :  *      freelist.c -- chooses victim for buffer replacement
                                 33                 :                :  *      buf_table.c -- manages the buffer lookup table
                                 34                 :                :  */
                                 35                 :                : #include "postgres.h"
                                 36                 :                : 
                                 37                 :                : #include <sys/file.h>
                                 38                 :                : #include <unistd.h>
                                 39                 :                : 
                                 40                 :                : #include "access/tableam.h"
                                 41                 :                : #include "access/xloginsert.h"
                                 42                 :                : #include "access/xlogutils.h"
                                 43                 :                : #include "catalog/storage.h"
                                 44                 :                : #include "catalog/storage_xlog.h"
                                 45                 :                : #include "executor/instrument.h"
                                 46                 :                : #include "lib/binaryheap.h"
                                 47                 :                : #include "miscadmin.h"
                                 48                 :                : #include "pg_trace.h"
                                 49                 :                : #include "pgstat.h"
                                 50                 :                : #include "postmaster/bgwriter.h"
                                 51                 :                : #include "storage/buf_internals.h"
                                 52                 :                : #include "storage/bufmgr.h"
                                 53                 :                : #include "storage/fd.h"
                                 54                 :                : #include "storage/ipc.h"
                                 55                 :                : #include "storage/lmgr.h"
                                 56                 :                : #include "storage/proc.h"
                                 57                 :                : #include "storage/smgr.h"
                                 58                 :                : #include "storage/standby.h"
                                 59                 :                : #include "utils/memdebug.h"
                                 60                 :                : #include "utils/ps_status.h"
                                 61                 :                : #include "utils/rel.h"
                                 62                 :                : #include "utils/resowner.h"
                                 63                 :                : #include "utils/timestamp.h"
                                 64                 :                : 
                                 65                 :                : 
                                 66                 :                : /* Note: these two macros only work on shared buffers, not local ones! */
                                 67                 :                : #define BufHdrGetBlock(bufHdr)  ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
                                 68                 :                : #define BufferGetLSN(bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))
                                 69                 :                : 
                                 70                 :                : /* Note: this macro only works on local buffers, not shared ones! */
                                 71                 :                : #define LocalBufHdrGetBlock(bufHdr) \
                                 72                 :                :     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
                                 73                 :                : 
                                 74                 :                : /* Bits in SyncOneBuffer's return value */
                                 75                 :                : #define BUF_WRITTEN             0x01
                                 76                 :                : #define BUF_REUSABLE            0x02
                                 77                 :                : 
                                 78                 :                : #define RELS_BSEARCH_THRESHOLD      20
                                 79                 :                : 
                                 80                 :                : /*
                                 81                 :                :  * This is the size (in the number of blocks) above which we scan the
                                 82                 :                :  * entire buffer pool to remove the buffers for all the pages of the relation
                                 83                 :                :  * being dropped. For the relations with size below this threshold, we find
                                 84                 :                :  * the buffers by doing lookups in BufMapping table.
                                 85                 :                :  */
                                 86                 :                : #define BUF_DROP_FULL_SCAN_THRESHOLD        (uint64) (NBuffers / 32)
                                 87                 :                : 
                                 88                 :                : typedef struct PrivateRefCountEntry
                                 89                 :                : {
                                 90                 :                :     Buffer      buffer;
                                 91                 :                :     int32       refcount;
                                 92                 :                : } PrivateRefCountEntry;
                                 93                 :                : 
                                 94                 :                : /* 64 bytes, about the size of a cache line on common systems */
                                 95                 :                : #define REFCOUNT_ARRAY_ENTRIES 8
                                 96                 :                : 
                                 97                 :                : /*
                                 98                 :                :  * Status of buffers to checkpoint for a particular tablespace, used
                                 99                 :                :  * internally in BufferSync.
                                100                 :                :  */
                                101                 :                : typedef struct CkptTsStatus
                                102                 :                : {
                                103                 :                :     /* oid of the tablespace */
                                104                 :                :     Oid         tsId;
                                105                 :                : 
                                106                 :                :     /*
                                107                 :                :      * Checkpoint progress for this tablespace. To make progress comparable
                                108                 :                :      * between tablespaces the progress is, for each tablespace, measured as a
                                109                 :                :      * number between 0 and the total number of to-be-checkpointed pages. Each
                                110                 :                :      * page checkpointed in this tablespace increments this space's progress
                                111                 :                :      * by progress_slice.
                                112                 :                :      */
                                113                 :                :     float8      progress;
                                114                 :                :     float8      progress_slice;
                                115                 :                : 
                                116                 :                :     /* number of to-be checkpointed pages in this tablespace */
                                117                 :                :     int         num_to_scan;
                                118                 :                :     /* already processed pages in this tablespace */
                                119                 :                :     int         num_scanned;
                                120                 :                : 
                                121                 :                :     /* current offset in CkptBufferIds for this tablespace */
                                122                 :                :     int         index;
                                123                 :                : } CkptTsStatus;
                                124                 :                : 
                                125                 :                : /*
                                126                 :                :  * Type for array used to sort SMgrRelations
                                127                 :                :  *
                                128                 :                :  * FlushRelationsAllBuffers shares the same comparator function with
                                129                 :                :  * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
                                130                 :                :  * compatible.
                                131                 :                :  */
                                132                 :                : typedef struct SMgrSortArray
                                133                 :                : {
                                134                 :                :     RelFileLocator rlocator;    /* This must be the first member */
                                135                 :                :     SMgrRelation srel;
                                136                 :                : } SMgrSortArray;
                                137                 :                : 
                                138                 :                : /* GUC variables */
                                139                 :                : bool        zero_damaged_pages = false;
                                140                 :                : int         bgwriter_lru_maxpages = 100;
                                141                 :                : double      bgwriter_lru_multiplier = 2.0;
                                142                 :                : bool        track_io_timing = false;
                                143                 :                : 
                                144                 :                : /*
                                145                 :                :  * How many buffers PrefetchBuffer callers should try to stay ahead of their
                                146                 :                :  * ReadBuffer calls by.  Zero means "never prefetch".  This value is only used
                                147                 :                :  * for buffers not belonging to tablespaces that have their
                                148                 :                :  * effective_io_concurrency parameter set.
                                149                 :                :  */
                                150                 :                : int         effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
                                151                 :                : 
                                152                 :                : /*
                                153                 :                :  * Like effective_io_concurrency, but used by maintenance code paths that might
                                154                 :                :  * benefit from a higher setting because they work on behalf of many sessions.
                                155                 :                :  * Overridden by the tablespace setting of the same name.
                                156                 :                :  */
                                157                 :                : int         maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
                                158                 :                : 
                                159                 :                : /*
                                160                 :                :  * Limit on how many blocks should be handled in single I/O operations.
                                161                 :                :  * StartReadBuffers() callers should respect it, as should other operations
                                162                 :                :  * that call smgr APIs directly.
                                163                 :                :  */
                                164                 :                : int         io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
                                165                 :                : 
                                166                 :                : /*
                                167                 :                :  * GUC variables about triggering kernel writeback for buffers written; OS
                                168                 :                :  * dependent defaults are set via the GUC mechanism.
                                169                 :                :  */
                                170                 :                : int         checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
                                171                 :                : int         bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
                                172                 :                : int         backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
                                173                 :                : 
                                174                 :                : /* local state for LockBufferForCleanup */
                                175                 :                : static BufferDesc *PinCountWaitBuf = NULL;
                                176                 :                : 
                                177                 :                : /*
                                178                 :                :  * Backend-Private refcount management:
                                179                 :                :  *
                                180                 :                :  * Each buffer also has a private refcount that keeps track of the number of
                                181                 :                :  * times the buffer is pinned in the current process.  This is so that the
                                182                 :                :  * shared refcount needs to be modified only once if a buffer is pinned more
                                183                 :                :  * than once by an individual backend.  It's also used to check that no buffers
                                184                 :                :  * are still pinned at the end of transactions and when exiting.
                                185                 :                :  *
                                186                 :                :  *
                                187                 :                :  * To avoid - as we used to - requiring an array with NBuffers entries to keep
                                188                 :                :  * track of local buffers, we use a small sequentially searched array
                                189                 :                :  * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
                                190                 :                :  * keep track of backend local pins.
                                191                 :                :  *
                                192                 :                :  * While at most REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
                                193                 :                :  * refcounts are kept track of in the array; after that, new array entries
                                194                 :                :  * displace old ones into the hash table. That way a frequently used entry
                                195                 :                :  * can't get "stuck" in the hashtable while infrequent ones clog the array.
                                196                 :                :  *
                                197                 :                :  * Note that in most scenarios the number of pinned buffers will not exceed
                                198                 :                :  * REFCOUNT_ARRAY_ENTRIES.
                                199                 :                :  *
                                200                 :                :  *
                                201                 :                :  * To enter a buffer into the refcount tracking mechanism first reserve a free
                                202                 :                :  * entry using ReservePrivateRefCountEntry() and then later, if necessary,
                                203                 :                :  * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
                                204                 :                :  * memory allocations in NewPrivateRefCountEntry() which can be important
                                205                 :                :  * because in some scenarios it's called with a spinlock held...
                                206                 :                :  */
                                207                 :                : static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
                                208                 :                : static HTAB *PrivateRefCountHash = NULL;
                                209                 :                : static int32 PrivateRefCountOverflowed = 0;
                                210                 :                : static uint32 PrivateRefCountClock = 0;
                                211                 :                : static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
                                212                 :                : 
                                213                 :                : static void ReservePrivateRefCountEntry(void);
                                214                 :                : static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
                                215                 :                : static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
                                216                 :                : static inline int32 GetPrivateRefCount(Buffer buffer);
                                217                 :                : static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
                                218                 :                : 
                                219                 :                : /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */
                                220                 :                : static void ResOwnerReleaseBufferIO(Datum res);
                                221                 :                : static char *ResOwnerPrintBufferIO(Datum res);
                                222                 :                : static void ResOwnerReleaseBufferPin(Datum res);
                                223                 :                : static char *ResOwnerPrintBufferPin(Datum res);
                                224                 :                : 
                                225                 :                : const ResourceOwnerDesc buffer_io_resowner_desc =
                                226                 :                : {
                                227                 :                :     .name = "buffer io",
                                228                 :                :     .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
                                229                 :                :     .release_priority = RELEASE_PRIO_BUFFER_IOS,
                                230                 :                :     .ReleaseResource = ResOwnerReleaseBufferIO,
                                231                 :                :     .DebugPrint = ResOwnerPrintBufferIO
                                232                 :                : };
                                233                 :                : 
                                234                 :                : const ResourceOwnerDesc buffer_pin_resowner_desc =
                                235                 :                : {
                                236                 :                :     .name = "buffer pin",
                                237                 :                :     .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
                                238                 :                :     .release_priority = RELEASE_PRIO_BUFFER_PINS,
                                239                 :                :     .ReleaseResource = ResOwnerReleaseBufferPin,
                                240                 :                :     .DebugPrint = ResOwnerPrintBufferPin
                                241                 :                : };
                                242                 :                : 
                                243                 :                : /*
                                244                 :                :  * Ensure that the PrivateRefCountArray has sufficient space to store one more
                                245                 :                :  * entry. This has to be called before using NewPrivateRefCountEntry() to fill
                                246                 :                :  * a new entry - but it's perfectly fine to not use a reserved entry.
                                247                 :                :  */
                                248                 :                : static void
 3373 andres@anarazel.de        249                 :CBC    54826905 : ReservePrivateRefCountEntry(void)
                                250                 :                : {
                                251                 :                :     /* Already reserved (or freed), nothing to do */
                                252         [ +  + ]:       54826905 :     if (ReservedRefCountEntry != NULL)
                                253                 :       51196689 :         return;
                                254                 :                : 
                                255                 :                :     /*
                                256                 :                :      * First search for a free entry in the array; that'll be sufficient in the
                                257                 :                :      * majority of cases.
                                258                 :                :      */
                                259                 :                :     {
                                260                 :                :         int         i;
                                261                 :                : 
                                262         [ +  + ]:        8711428 :         for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
                                263                 :                :         {
                                264                 :                :             PrivateRefCountEntry *res;
                                265                 :                : 
                                266                 :        8627384 :             res = &PrivateRefCountArray[i];
                                267                 :                : 
                                268         [ +  + ]:        8627384 :             if (res->buffer == InvalidBuffer)
                                269                 :                :             {
                                270                 :        3546172 :                 ReservedRefCountEntry = res;
                                271                 :        3546172 :                 return;
                                272                 :                :             }
                                273                 :                :         }
                                274                 :                :     }
                                275                 :                : 
                                276                 :                :     /*
                                277                 :                :      * No luck. All array entries are full. Move one array entry into the hash
                                278                 :                :      * table.
                                279                 :                :      */
                                280                 :                :     {
                                281                 :                :         /*
                                282                 :                :          * Move entry from the current clock position in the array into the
                                283                 :                :          * hashtable. Use that slot.
                                284                 :                :          */
                                285                 :                :         PrivateRefCountEntry *hashent;
                                286                 :                :         bool        found;
                                287                 :                : 
                                288                 :                :         /* select victim slot */
 3249 bruce@momjian.us          289                 :          84044 :         ReservedRefCountEntry =
 3373 andres@anarazel.de        290                 :          84044 :             &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
                                291                 :                : 
                                292                 :                :         /* Better be used, otherwise we shouldn't get here. */
                                293         [ -  + ]:          84044 :         Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
                                294                 :                : 
                                295                 :                :         /* enter victim array entry into hashtable */
                                296                 :          84044 :         hashent = hash_search(PrivateRefCountHash,
  433 peter@eisentraut.org      297                 :          84044 :                               &(ReservedRefCountEntry->buffer),
                                298                 :                :                               HASH_ENTER,
                                299                 :                :                               &found);
 3373 andres@anarazel.de        300         [ -  + ]:          84044 :         Assert(!found);
                                301                 :          84044 :         hashent->refcount = ReservedRefCountEntry->refcount;
                                302                 :                : 
                                303                 :                :         /* clear the now free array slot */
                                304                 :          84044 :         ReservedRefCountEntry->buffer = InvalidBuffer;
                                305                 :          84044 :         ReservedRefCountEntry->refcount = 0;
                                306                 :                : 
                                307                 :          84044 :         PrivateRefCountOverflowed++;
                                308                 :                :     }
                                309                 :                : }
                                310                 :                : 
                                311                 :                : /*
                                312                 :                :  * Take the previously reserved refcount entry and initialize it for 'buffer'.
                                313                 :                :  */
                                314                 :                : static PrivateRefCountEntry *
                                315                 :       49607311 : NewPrivateRefCountEntry(Buffer buffer)
                                316                 :                : {
                                317                 :                :     PrivateRefCountEntry *res;
                                318                 :                : 
                                319                 :                :     /* caller must have made a reservation before calling; asserted here */
                                320         [ -  + ]:       49607311 :     Assert(ReservedRefCountEntry != NULL);
                                321                 :                : 
                                322                 :                :     /* take the reserved entry and clear the reservation */
                                323                 :       49607311 :     res = ReservedRefCountEntry;
                                324                 :       49607311 :     ReservedRefCountEntry = NULL;
                                325                 :                : 
                                326                 :                :     /* fill it for this buffer; the refcount starts out at zero */
                                327                 :       49607311 :     res->buffer = buffer;
                                328                 :       49607311 :     res->refcount = 0;
                                329                 :                : 
                                330                 :       49607311 :     return res;
                                331                 :                : }
                                332                 :                : 
                                333                 :                : /*
                                334                 :                :  * Return the PrivateRefCount entry for the passed buffer.
                                335                 :                :  *
                                336                 :                :  * Returns NULL if the buffer doesn't have a refcount entry.  Otherwise, if
                                337                 :                :  * do_move is true and the entry resides in the hashtable, the entry is
                                338                 :                :  * moved into the array to optimize it for frequent access.
                                339                 :                :  */
                                340                 :                : static PrivateRefCountEntry *
                                341                 :      450035745 : GetPrivateRefCountEntry(Buffer buffer, bool do_move)
                                342                 :                : {
                                343                 :                :     PrivateRefCountEntry *res;
                                344                 :                :     int         i;
                                345                 :                : 
 3515                           346         [ -  + ]:      450035745 :     Assert(BufferIsValid(buffer));
                                347         [ -  + ]:      450035745 :     Assert(!BufferIsLocal(buffer));
                                348                 :                : 
                                349                 :                :     /*
                                350                 :                :      * First search for references in the array; that will be sufficient in
                                351                 :                :      * the majority of cases.
                                352                 :                :      */
                                353         [ +  + ]:     1274208849 :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
                                354                 :                :     {
                                355                 :     1223566631 :         res = &PrivateRefCountArray[i];
                                356                 :                : 
                                357         [ +  + ]:     1223566631 :         if (res->buffer == buffer)
                                358                 :      399393527 :             return res;
                                359                 :                :     }
                                360                 :                : 
                                361                 :                :     /*
                                362                 :                :      * By here we know that the buffer, if already pinned, doesn't reside in
                                363                 :                :      * the array.
                                364                 :                :      *
                                365                 :                :      * Only look the buffer up in the hashtable if entries have overflowed
                                366                 :                :      * into it, i.e. PrivateRefCountOverflowed is nonzero.
                                367                 :                :      */
 3373                           368         [ +  + ]:       50642218 :     if (PrivateRefCountOverflowed == 0)
                                369                 :       49880809 :         return NULL;
                                370                 :                : 
  433 peter@eisentraut.org      371                 :         761409 :     res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
                                372                 :                : 
 3373 andres@anarazel.de        373         [ +  + ]:         761409 :     if (res == NULL)
                                374                 :         245327 :         return NULL;
                                375         [ +  + ]:         516082 :     else if (!do_move)
                                376                 :                :     {
                                377                 :                :         /* caller doesn't want us to move the hash entry into the array */
                                378                 :         508507 :         return res;
                                379                 :                :     }
                                380                 :                :     else
                                381                 :                :     {
                                382                 :                :         /* move the entry from the hashtable into a free array slot */
                                383                 :                :         bool        found;
                                384                 :                :         PrivateRefCountEntry *free;
                                385                 :                : 
                                386                 :                :         /* Ensure there's a free array slot and that it is reserved */
                                387                 :           7575 :         ReservePrivateRefCountEntry();
                                388                 :                : 
                                389                 :                :         /* Use up the reserved slot */
                                390         [ -  + ]:           7575 :         Assert(ReservedRefCountEntry != NULL);
                                391                 :           7575 :         free = ReservedRefCountEntry;
                                392                 :           7575 :         ReservedRefCountEntry = NULL;
                                393         [ -  + ]:           7575 :         Assert(free->buffer == InvalidBuffer);
                                394                 :                : 
                                395                 :                :         /* and fill it, carrying over the refcount from the hash entry */
                                396                 :           7575 :         free->buffer = buffer;
                                397                 :           7575 :         free->refcount = res->refcount;
                                398                 :                : 
                                399                 :                :         /* delete from hashtable and account for one less overflowed entry */
  433 peter@eisentraut.org      400                 :           7575 :         hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
 3373 andres@anarazel.de        401         [ -  + ]:           7575 :         Assert(found);
                                402         [ -  + ]:           7575 :         Assert(PrivateRefCountOverflowed > 0);
                                403                 :           7575 :         PrivateRefCountOverflowed--;
                                404                 :                : 
                                405                 :           7575 :         return free;
                                406                 :                :     }
                                407                 :                : }
                                408                 :                : 
                                409                 :                : /*
                                410                 :                :  * Returns how many times the passed buffer is pinned by this backend, or 0
                                411                 :                :  * if there is no private refcount entry for it.
                                412                 :                :  * Only works for shared memory buffers!
                                413                 :                :  */
                                414                 :                : static inline int32
 3515                           415                 :      328390583 : GetPrivateRefCount(Buffer buffer)
                                416                 :                : {
                                417                 :                :     PrivateRefCountEntry *ref;
                                418                 :                : 
                                419         [ -  + ]:      328390583 :     Assert(BufferIsValid(buffer));
                                420         [ -  + ]:      328390583 :     Assert(!BufferIsLocal(buffer));
                                421                 :                : 
                                422                 :                :     /*
                                423                 :                :      * Look up without moving the entry (do_move = false) - that's ok for the
                                424                 :                :      * current users, but we might want to change this one day.
                                425                 :                :      */
 3373                           426                 :      328390583 :     ref = GetPrivateRefCountEntry(buffer, false);
                                427                 :                : 
 3515                           428         [ +  + ]:      328390583 :     if (ref == NULL)
                                429                 :         518825 :         return 0;
                                430                 :      327871758 :     return ref->refcount;
                                431                 :                : }
                                432                 :                : 
                                433                 :                : /*
                                434                 :                :  * Release the tracking entry of a buffer that we no longer have pinned and
                                435                 :                :  * don't want to pin again immediately; its refcount must already be zero.
                                436                 :                :  */
                                437                 :                : static void
                                438                 :       49607311 : ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
                                439                 :                : {
                                440         [ -  + ]:       49607311 :     Assert(ref->refcount == 0);
                                441                 :                : 
                                442   [ +  -  +  + ]:       49607311 :     if (ref >= &PrivateRefCountArray[0] &&
                                443                 :                :         ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
                                444                 :                :     {
                                445                 :       49530842 :         ref->buffer = InvalidBuffer;
                                446                 :                : 
                                447                 :                :         /*
                                448                 :                :          * Mark the just-released entry as reserved; in many scenarios that
                                449                 :                :          * allows us to avoid ever having to search the array/hash for free
                                450                 :                :          * entries.
                                451                 :                :          */
 3373                           452                 :       49530842 :         ReservedRefCountEntry = ref;
                                453                 :                :     }
                                454                 :                :     else
                                455                 :                :     {
                                456                 :                :         bool        found;
 3249 bruce@momjian.us          457                 :          76469 :         Buffer      buffer = ref->buffer;
                                458                 :                : 
  433 peter@eisentraut.org      459                 :          76469 :         hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
 3515 andres@anarazel.de        460         [ -  + ]:          76469 :         Assert(found);
                                461         [ -  + ]:          76469 :         Assert(PrivateRefCountOverflowed > 0);
                                462                 :          76469 :         PrivateRefCountOverflowed--;
                                463                 :                :     }
                                464                 :       49607311 : }
                                465                 :                : 
                                466                 :                : /*
                                467                 :                :  * BufferIsPinned
                                468                 :                :  *      True iff the buffer is pinned (also checks for valid buffer number).
                                469                 :                :  *      The macro argument is evaluated more than once.
                                470                 :                :  *      NOTE: what we check here is that *this* backend holds a pin on
                                471                 :                :  *      the buffer.  We do not care whether some other backend does.
                                472                 :                :  */
                                473                 :                : #define BufferIsPinned(bufnum) \
                                474                 :                : ( \
                                475                 :                :     !BufferIsValid(bufnum) ? \
                                476                 :                :         false \
                                477                 :                :     : \
                                478                 :                :         BufferIsLocal(bufnum) ? \
                                479                 :                :             (LocalRefCount[-(bufnum) - 1] > 0) \
                                480                 :                :         : \
                                481                 :                :     (GetPrivateRefCount(bufnum) > 0) \
                                482                 :                : )
                                483                 :                : 
                                484                 :                : /* Prototypes for file-local (static) functions */
                                485                 :                : static Buffer ReadBuffer_common(Relation rel,
                                486                 :                :                                 SMgrRelation smgr, char smgr_persistence,
                                487                 :                :                                 ForkNumber forkNum, BlockNumber blockNum,
                                488                 :                :                                 ReadBufferMode mode, BufferAccessStrategy strategy);
                                489                 :                : static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
                                490                 :                :                                            ForkNumber fork,
                                491                 :                :                                            BufferAccessStrategy strategy,
                                492                 :                :                                            uint32 flags,
                                493                 :                :                                            uint32 extend_by,
                                494                 :                :                                            BlockNumber extend_upto,
                                495                 :                :                                            Buffer *buffers,
                                496                 :                :                                            uint32 *extended_by);
                                497                 :                : static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
                                498                 :                :                                            ForkNumber fork,
                                499                 :                :                                            BufferAccessStrategy strategy,
                                500                 :                :                                            uint32 flags,
                                501                 :                :                                            uint32 extend_by,
                                502                 :                :                                            BlockNumber extend_upto,
                                503                 :                :                                            Buffer *buffers,
                                504                 :                :                                            uint32 *extended_by);
                                505                 :                : static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
                                506                 :                : static void PinBuffer_Locked(BufferDesc *buf);
                                507                 :                : static void UnpinBuffer(BufferDesc *buf);
                                508                 :                : static void UnpinBufferNoOwner(BufferDesc *buf);
                                509                 :                : static void BufferSync(int flags);
                                510                 :                : static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
                                511                 :                : static int  SyncOneBuffer(int buf_id, bool skip_recently_used,
                                512                 :                :                           WritebackContext *wb_context);
                                513                 :                : static void WaitIO(BufferDesc *buf);
                                514                 :                : static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
                                515                 :                : static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
                                516                 :                :                               uint32 set_flag_bits, bool forget_owner);
                                517                 :                : static void AbortBufferIO(Buffer buffer);
                                518                 :                : static void shared_buffer_write_error_callback(void *arg);
                                519                 :                : static void local_buffer_write_error_callback(void *arg);
                                520                 :                : static inline BufferDesc *BufferAlloc(SMgrRelation smgr,
                                521                 :                :                                       char relpersistence,
                                522                 :                :                                       ForkNumber forkNum,
                                523                 :                :                                       BlockNumber blockNum,
                                524                 :                :                                       BufferAccessStrategy strategy,
                                525                 :                :                                       bool *foundPtr, IOContext io_context);
                                526                 :                : static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
                                527                 :                : static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
                                528                 :                :                         IOObject io_object, IOContext io_context);
                                529                 :                : static void FindAndDropRelationBuffers(RelFileLocator rlocator,
                                530                 :                :                                        ForkNumber forkNum,
                                531                 :                :                                        BlockNumber nForkBlock,
                                532                 :                :                                        BlockNumber firstDelBlock);
                                533                 :                : static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
                                534                 :                :                                            RelFileLocator dstlocator,
                                535                 :                :                                            ForkNumber forkNum, bool permanent);
                                536                 :                : static void AtProcExit_Buffers(int code, Datum arg);
                                537                 :                : static void CheckForBufferLeaks(void);
                                538                 :                : static int  rlocator_comparator(const void *p1, const void *p2);
                                539                 :                : static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
                                540                 :                : static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
                                541                 :                : static int  ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
                                542                 :                : 
                                543                 :                : 
                                544                 :                : /*
                                545                 :                :  * Implementation of PrefetchBuffer() for shared buffers.
                                546                 :                :  */
                                547                 :                : PrefetchBufferResult
 1467 tmunro@postgresql.or      548                 :         766779 : PrefetchSharedBuffer(SMgrRelation smgr_reln,
                                549                 :                :                      ForkNumber forkNum,
                                550                 :                :                      BlockNumber blockNum)
                                551                 :                : {
                                552                 :         766779 :     PrefetchBufferResult result = {InvalidBuffer, false};
                                553                 :                :     BufferTag   newTag;         /* identity of requested block */
                                554                 :                :     uint32      newHash;        /* hash value for newTag */
                                555                 :                :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
                                556                 :                :     int         buf_id;
                                557                 :                : 
                                558         [ -  + ]:         766779 :     Assert(BlockNumberIsValid(blockNum));
                                559                 :                : 
                                560                 :                :     /* create a tag so we can look up the buffer */
  627 rhaas@postgresql.org      561                 :         766779 :     InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
                                562                 :                :                   forkNum, blockNum);
                                563                 :                : 
                                564                 :                :     /* determine its hash code and partition lock ID */
 1467 tmunro@postgresql.or      565                 :         766779 :     newHash = BufTableHashCode(&newTag);
                                566                 :         766779 :     newPartitionLock = BufMappingPartitionLock(newHash);
                                567                 :                : 
                                568                 :                :     /* see if the block is in the buffer pool already */
                                569                 :         766779 :     LWLockAcquire(newPartitionLock, LW_SHARED);
                                570                 :         766779 :     buf_id = BufTableLookup(&newTag, newHash);
                                571                 :         766779 :     LWLockRelease(newPartitionLock);
                                572                 :                : 
                                573                 :                :     /* If not in buffers, initiate prefetch */
                                574         [ +  + ]:         766779 :     if (buf_id < 0)
                                575                 :                :     {
                                576                 :                : #ifdef USE_PREFETCH
                                577                 :                :         /*
                                578                 :                :          * Unless IO_DIRECT_DATA is set, try to initiate an asynchronous read.
                                579                 :                :          * smgrprefetch() returns false in recovery if the file doesn't exist.
                                580                 :                :          */
  372                           581   [ +  +  +  - ]:         274567 :         if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
  120 tmunro@postgresql.or      582                 :GNC      137173 :             smgrprefetch(smgr_reln, forkNum, blockNum, 1))
                                583                 :                :         {
 1467 tmunro@postgresql.or      584                 :CBC      137173 :             result.initiated_io = true;
                                585                 :                :         }
                                586                 :                : #endif                          /* USE_PREFETCH */
                                587                 :                :     }
                                588                 :                :     else
                                589                 :                :     {
                                590                 :                :         /*
                                591                 :                :          * Report the buffer the block was in at lookup time (buf_id + 1).
                                592                 :                :          * The caller may be able to avoid a buffer table lookup, but the
                                593                 :                :          * buffer is not pinned and it must be rechecked!
                                594                 :                :          */
                                595                 :         629385 :         result.recent_buffer = buf_id + 1;
                                596                 :                :     }
                                597                 :                : 
                                598                 :                :     /*
                                599                 :                :      * If the block *is* in buffers, we do nothing beyond reporting it above.
                                600                 :                :      * This is not really ideal: the block might be just about to be evicted,
                                601                 :                :      * which would be stupid since we know we are going to need it soon.  But
                                602                 :                :      * the only easy answer is to bump the usage_count, which does not seem
                                603                 :                :      * like a great solution: when the caller does ultimately touch the block,
                                604                 :                :      * usage_count would get bumped again, resulting in too much favoritism
                                605                 :                :      * for blocks that are involved in a prefetch sequence.  A real fix would
                                606                 :                :      * involve some additional per-buffer state, and it's not clear that
                                607                 :                :      * there's enough of a problem to justify that.
                                608                 :                :      */
                                609                 :                : 
                                610                 :         766779 :     return result;
                                611                 :                : }
                                612                 :                : 
                                613                 :                : /*
                                614                 :                :  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
                                615                 :                :  *
                                616                 :                :  * This is named by analogy to ReadBuffer but doesn't actually allocate a
                                617                 :                :  * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
                                618                 :                :  * block will not be delayed by the I/O.  Prefetching is optional.
                                619                 :                :  *
                                620                 :                :  * There are three possible outcomes:
                                621                 :                :  *
                                622                 :                :  * 1.  If the block is already cached, the result includes a valid buffer that
                                623                 :                :  * could be used by the caller to avoid the need for a later buffer lookup, but
                                624                 :                :  * it's not pinned, so the caller must recheck it.
                                625                 :                :  *
                                626                 :                :  * 2.  If the kernel has been asked to initiate I/O, the initiated_io member is
                                627                 :                :  * true.  Currently there is no way to know if the data was already cached by
                                628                 :                :  * the kernel and therefore didn't really initiate I/O, and no way to know when
                                629                 :                :  * the I/O completes other than using synchronous ReadBuffer().
                                630                 :                :  *
                                631                 :                :  * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and either
                                632                 :                :  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
                                633                 :                :  * lack of a kernel facility), direct I/O is enabled, or the underlying
                                634                 :                :  * relation file wasn't found and we are in recovery.  (If the relation file
                                635                 :                :  * wasn't found and we are not in recovery, an error is raised).
                                636                 :                :  */
                                637                 :                : PrefetchBufferResult
 5571 tgl@sss.pgh.pa.us         638                 :         196025 : PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
                                639                 :                : {
                                640         [ -  + ]:         196025 :     Assert(RelationIsValid(reln));
                                641         [ -  + ]:         196025 :     Assert(BlockNumberIsValid(blockNum));
                                642                 :                : 
 4871 rhaas@postgresql.org      643         [ +  + ]:         196025 :     if (RelationUsesLocalBuffers(reln))
                                644                 :                :     {
                                645                 :                :         /* see comments in ReadBufferExtended */
 5493 tgl@sss.pgh.pa.us         646   [ +  -  -  + ]:           3100 :         if (RELATION_IS_OTHER_TEMP(reln))
 5493 tgl@sss.pgh.pa.us         647         [ #  # ]:UBC           0 :             ereport(ERROR,
                                648                 :                :                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                649                 :                :                      errmsg("cannot access temporary tables of other sessions")));
                                650                 :                : 
                                651                 :                :         /* pass it off to localbuf.c */
 1007 tgl@sss.pgh.pa.us         652                 :CBC        3100 :         return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
                                653                 :                :     }
                                654                 :                :     else
                                655                 :                :     {
                                656                 :                :         /* pass it to the shared buffer version */
                                657                 :         192925 :         return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
                                658                 :                :     }
                                659                 :                : }
                                660                 :                : 
                                661                 :                : /*
                                662                 :                :  * ReadRecentBuffer -- try to pin a block in a recently observed buffer
                                663                 :                :  *
                                664                 :                :  * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
                                665                 :                :  * successful.  Return true if the buffer is valid and still has the expected
                                666                 :                :  * tag.  In that case, the buffer is pinned and the usage count is bumped.
                                667                 :                :  */
                                668                 :                : bool
  648 rhaas@postgresql.org      669                 :         518831 : ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
                                670                 :                :                  Buffer recent_buffer)
                                671                 :                : {
                                672                 :                :     BufferDesc *bufHdr;
                                673                 :                :     BufferTag   tag;
                                674                 :                :     uint32      buf_state;
                                675                 :                :     bool        have_private_ref;
                                676                 :                : 
 1102 tmunro@postgresql.or      677         [ -  + ]:         518831 :     Assert(BufferIsValid(recent_buffer));
                                678                 :                : 
  158 heikki.linnakangas@i      679                 :GNC      518831 :     ResourceOwnerEnlarge(CurrentResourceOwner);
 1102 tmunro@postgresql.or      680                 :CBC      518831 :     ReservePrivateRefCountEntry();
  627 rhaas@postgresql.org      681                 :         518831 :     InitBufferTag(&tag, &rlocator, forkNum, blockNum);
                                682                 :                : 
 1102 tmunro@postgresql.or      683         [ -  + ]:         518831 :     if (BufferIsLocal(recent_buffer))
                                684                 :                :     {
  629 heikki.linnakangas@i      685                 :UBC           0 :         int         b = -recent_buffer - 1;
                                686                 :                : 
                                687                 :              0 :         bufHdr = GetLocalBufferDescriptor(b);
 1102 tmunro@postgresql.or      688                 :              0 :         buf_state = pg_atomic_read_u32(&bufHdr->state);
                                689                 :                : 
                                690                 :                :         /* Is it still valid and holding the right tag? */
  627 rhaas@postgresql.org      691   [ #  #  #  # ]:              0 :         if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
                                692                 :                :         {
  375 andres@anarazel.de        693                 :              0 :             PinLocalBuffer(bufHdr, true);
                                694                 :                : 
  738 tmunro@postgresql.or      695                 :              0 :             pgBufferUsage.local_blks_hit++;
                                696                 :                : 
 1102                           697                 :              0 :             return true;
                                698                 :                :         }
                                699                 :                :     }
                                700                 :                :     else
                                701                 :                :     {
 1102 tmunro@postgresql.or      702                 :CBC      518831 :         bufHdr = GetBufferDescriptor(recent_buffer - 1);
                                703                 :         518831 :         have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
                                704                 :                : 
                                705                 :                :         /*
                                706                 :                :          * Do we already have this buffer pinned with a private reference?  If
                                707                 :                :          * so, it must be valid and it is safe to check the tag without
                                708                 :                :          * locking.  If not, we have to lock the header first and then check.
                                709                 :                :          */
                                710         [ +  + ]:         518831 :         if (have_private_ref)
                                711                 :              7 :             buf_state = pg_atomic_read_u32(&bufHdr->state);
                                712                 :                :         else
                                713                 :         518824 :             buf_state = LockBufHdr(bufHdr);
                                714                 :                : 
  627 rhaas@postgresql.org      715   [ +  +  +  + ]:         518831 :         if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
                                716                 :                :         {
                                717                 :                :             /*
                                718                 :                :              * It's now safe to pin the buffer.  We can't pin first and ask
                                719                 :                :              * questions later, because it might confuse code paths like
                                720                 :                :              * InvalidateBuffer() if we pinned a random non-matching buffer.
                                721                 :                :              */
 1102 tmunro@postgresql.or      722         [ -  + ]:         516485 :             if (have_private_ref)
 1102 tmunro@postgresql.or      723                 :UBC           0 :                 PinBuffer(bufHdr, NULL);    /* bump pin count */
                                724                 :                :             else
 1102 tmunro@postgresql.or      725                 :CBC      516485 :                 PinBuffer_Locked(bufHdr);   /* pin for first time */
                                726                 :                : 
  738                           727                 :         516485 :             pgBufferUsage.shared_blks_hit++;
                                728                 :                : 
 1102                           729                 :         516485 :             return true;
                                730                 :                :         }
                                731                 :                : 
                                732                 :                :         /* If we locked the header above, now unlock. */
                                733         [ +  + ]:           2346 :         if (!have_private_ref)
                                734                 :           2339 :             UnlockBufHdr(bufHdr, buf_state);
                                735                 :                :     }
                                736                 :                : 
                                737                 :           2346 :     return false;
                                738                 :                : }
                                739                 :                : 
                                740                 :                : /*
                                741                 :                :  * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
                                742                 :                :  *      fork with RBM_NORMAL mode and default strategy.
                                743                 :                :  */
                                744                 :                : Buffer
 5644 heikki.linnakangas@i      745                 :       38016719 : ReadBuffer(Relation reln, BlockNumber blockNum)
                                746                 :                : {
                                747                 :       38016719 :     return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
                                748                 :                : }
                                749                 :                : 
                                750                 :                : /*
                                751                 :                :  * ReadBufferExtended -- returns a buffer containing the requested
                                752                 :                :  *      block of the requested relation.  If the blknum
                                753                 :                :  *      requested is P_NEW, extend the relation file and
                                754                 :                :  *      allocate a new block.  (Caller is responsible for
                                755                 :                :  *      ensuring that only one backend tries to extend a
                                756                 :                :  *      relation at the same time!)
                                757                 :                :  *
                                758                 :                :  * Returns: the buffer number for the buffer containing
                                759                 :                :  *      the block read.  The returned buffer has been pinned.
                                760                 :                :  *      Does not return on error --- elog's instead.
                                761                 :                :  *
                                762                 :                :  * Assume when this function is called, that reln has been opened already.
                                763                 :                :  *
                                764                 :                :  * In RBM_NORMAL mode, the page is read from disk, and the page header is
                                765                 :                :  * validated.  An error is thrown if the page header is not valid.  (But
                                766                 :                :  * note that an all-zero page is considered "valid"; see
                                767                 :                :  * PageIsVerifiedExtended().)
                                768                 :                :  *
                                769                 :                :  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
                                770                 :                :  * valid, the page is zeroed instead of throwing an error. This is intended
                                771                 :                :  * for non-critical data, where the caller is prepared to repair errors.
                                772                 :                :  *
                                773                 :                :  * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
                                774                 :                :  * filled with zeros instead of reading it from disk.  Useful when the caller
                                775                 :                :  * is going to fill the page from scratch, since this saves I/O and avoids
                                776                 :                :  * unnecessary failure if the page-on-disk has corrupt page headers.
                                777                 :                :  * The page is returned locked to ensure that the caller has a chance to
                                778                 :                :  * initialize the page before it's made visible to others.
                                779                 :                :  * Caution: do not use this mode to read a page that is beyond the relation's
                                780                 :                :  * current physical EOF; that is likely to cause problems in md.c when
                                781                 :                :  * the page is modified and written out. P_NEW is OK, though.
                                782                 :                :  *
                                783                 :                :  * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
                                784                 :                :  * a cleanup-strength lock on the page.
                                785                 :                :  *
                                786                 :                :  * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
                                787                 :                :  *
                                788                 :                :  * If strategy is not NULL, a nondefault buffer access strategy is used.
                                789                 :                :  * See buffer/README for details.
                                790                 :                :  */
                                791                 :                : inline Buffer
                                792                 :       46293649 : ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
                                793                 :                :                    ReadBufferMode mode, BufferAccessStrategy strategy)
                                794                 :                : {
                                795                 :                :     Buffer      buf;
                                796                 :                : 
                                797                 :                :     /*
                                798                 :                :      * Reject attempts to read non-local temporary relations; we would be
                                799                 :                :      * likely to get wrong data since we have no visibility into the owning
                                800                 :                :      * session's local buffers.
                                801                 :                :      */
 5493 tgl@sss.pgh.pa.us         802   [ +  +  -  + ]:       46293649 :     if (RELATION_IS_OTHER_TEMP(reln))
 5493 tgl@sss.pgh.pa.us         803         [ #  # ]:UBC           0 :         ereport(ERROR,
                                804                 :                :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                                805                 :                :                  errmsg("cannot access temporary tables of other sessions")));
                                806                 :                : 
                                807                 :                :     /*
                                808                 :                :      * Read the buffer, and update pgstat counters to reflect a cache hit or
                                809                 :                :      * miss.
                                810                 :                :      */
   11 tmunro@postgresql.or      811                 :GNC    46293649 :     buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
                                812                 :                :                             forkNum, blockNum, mode, strategy);
                                813                 :                : 
 5785 heikki.linnakangas@i      814                 :CBC    46293633 :     return buf;
                                815                 :                : }
                                816                 :                : 
                                817                 :                : 
                                818                 :                : /*
                                819                 :                :  * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
                                820                 :                :  *      a relcache entry for the relation.
                                821                 :                :  *
                                822                 :                :  * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
                                823                 :                :  * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
                                824                 :                :  * cannot be used for temporary relations (and making that work might be
                                825                 :                :  * difficult, unless we only want to read temporary relations for our own
                                826                 :                :  * ProcNumber).
                                827                 :                :  */
                                828                 :                : Buffer
  648 rhaas@postgresql.org      829                 :        3345591 : ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
                                830                 :                :                           BlockNumber blockNum, ReadBufferMode mode,
                                831                 :                :                           BufferAccessStrategy strategy, bool permanent)
                                832                 :                : {
   42 heikki.linnakangas@i      833                 :GNC     3345591 :     SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
                                834                 :                : 
   11 tmunro@postgresql.or      835         [ +  - ]:        3345591 :     return ReadBuffer_common(NULL, smgr,
                                836                 :                :                              permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
                                837                 :                :                              forkNum, blockNum,
                                838                 :                :                              mode, strategy);
                                839                 :                : }
                                840                 :                : 
                                841                 :                : /*
                                842                 :                :  * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
                                843                 :                :  */
                                844                 :                : Buffer
  235 tmunro@postgresql.or      845                 :CBC       43892 : ExtendBufferedRel(BufferManagerRelation bmr,
                                846                 :                :                   ForkNumber forkNum,
                                847                 :                :                   BufferAccessStrategy strategy,
                                848                 :                :                   uint32 flags)
                                849                 :                : {
                                850                 :                :     Buffer      buf;
  375 andres@anarazel.de        851                 :          43892 :     uint32      extend_by = 1;
                                852                 :                : 
  235 tmunro@postgresql.or      853                 :          43892 :     ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
                                854                 :                :                         &buf, &extend_by);
                                855                 :                : 
  375 andres@anarazel.de        856                 :          43892 :     return buf;
                                857                 :                : }
                                858                 :                : 
                                859                 :                : /*
                                860                 :                :  * Extend relation by multiple blocks.
                                861                 :                :  *
                                862                 :                :  * Tries to extend the relation by extend_by blocks. Depending on the
                                863                 :                :  * availability of resources the relation may end up being extended by a
                                864                 :                :  * smaller number of pages (unless an error is thrown, always by at least one
                                865                 :                :  * page). *extended_by is updated to the number of pages the relation has been
                                866                 :                :  * extended by.
                                867                 :                :  *
                                868                 :                :  * buffers needs to be an array that is at least extend_by long. Upon
                                869                 :                :  * completion, the first *extended_by array elements will each point to a
                                870                 :                :  * pinned buffer.
                                871                 :                :  *
                                872                 :                :  * If EB_LOCK_FIRST is part of flags, the first returned buffer is
                                873                 :                :  * locked. This is useful for callers that want a buffer that is guaranteed to
                                874                 :                :  * be empty.
                                875                 :                :  */
                                876                 :                : BlockNumber
  235 tmunro@postgresql.or      877                 :         140189 : ExtendBufferedRelBy(BufferManagerRelation bmr,
                                878                 :                :                     ForkNumber fork,
                                879                 :                :                     BufferAccessStrategy strategy,
                                880                 :                :                     uint32 flags,
                                881                 :                :                     uint32 extend_by,
                                882                 :                :                     Buffer *buffers,
                                883                 :                :                     uint32 *extended_by)
                                884                 :                : {
                                885         [ -  + ]:         140189 :     Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
                                886   [ -  +  -  - ]:         140189 :     Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
  375 andres@anarazel.de        887         [ -  + ]:         140189 :     Assert(extend_by > 0);
                                888                 :                : 
  235 tmunro@postgresql.or      889         [ +  - ]:         140189 :     if (bmr.smgr == NULL)
                                890                 :                :     {
                                891                 :         140189 :         bmr.smgr = RelationGetSmgr(bmr.rel);
                                892                 :         140189 :         bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
                                893                 :                :     }
                                894                 :                : 
                                895                 :         140189 :     return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
                                896                 :                :                                    extend_by, InvalidBlockNumber,
                                897                 :                :                                    buffers, extended_by);
                                898                 :                : }
                                899                 :                : 
                                900                 :                : /*
                                901                 :                :  * Extend the relation so it is at least extend_to blocks large, return buffer
                                902                 :                :  * (extend_to - 1).
                                903                 :                :  *
                                904                 :                :  * This is useful for callers that want to write a specific page, regardless
                                905                 :                :  * of the current size of the relation (e.g. useful for visibilitymap and for
                                906                 :                :  * crash recovery).
                                907                 :                :  */
                                908                 :                : Buffer
                                909                 :          53605 : ExtendBufferedRelTo(BufferManagerRelation bmr,
                                910                 :                :                     ForkNumber fork,
                                911                 :                :                     BufferAccessStrategy strategy,
                                912                 :                :                     uint32 flags,
                                913                 :                :                     BlockNumber extend_to,
                                914                 :                :                     ReadBufferMode mode)
                                915                 :                : {
                                916                 :                :     BlockNumber current_size;
  375 andres@anarazel.de        917                 :          53605 :     uint32      extended_by = 0;
                                918                 :          53605 :     Buffer      buffer = InvalidBuffer;
                                919                 :                :     Buffer      buffers[64];
                                920                 :                : 
  235 tmunro@postgresql.or      921         [ -  + ]:          53605 :     Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
                                922   [ +  +  -  + ]:          53605 :     Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
  375 andres@anarazel.de        923   [ +  -  -  + ]:          53605 :     Assert(extend_to != InvalidBlockNumber && extend_to > 0);
                                924                 :                : 
  235 tmunro@postgresql.or      925         [ +  + ]:          53605 :     if (bmr.smgr == NULL)
                                926                 :                :     {
                                927                 :           5748 :         bmr.smgr = RelationGetSmgr(bmr.rel);
                                928                 :           5748 :         bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
                                929                 :                :     }
                                930                 :                : 
                                931                 :                :     /*
                                932                 :                :      * If desired, create the file if it doesn't exist.  If
                                933                 :                :      * smgr_cached_nblocks[fork] is positive then it must exist, no need for
                                934                 :                :      * an smgrexists call.
                                935                 :                :      */
  375 andres@anarazel.de        936         [ +  + ]:          53605 :     if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
  235 tmunro@postgresql.or      937         [ +  + ]:           5748 :         (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
                                938         [ -  + ]:             12 :          bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
                                939         [ +  + ]:           5736 :         !smgrexists(bmr.smgr, fork))
                                940                 :                :     {
                                941                 :           5727 :         LockRelationForExtension(bmr.rel, ExclusiveLock);
                                942                 :                : 
                                943                 :                :         /* recheck, fork might have been created concurrently */
                                944         [ +  + ]:           5727 :         if (!smgrexists(bmr.smgr, fork))
                                945                 :           5716 :             smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
                                946                 :                : 
                                947                 :           5727 :         UnlockRelationForExtension(bmr.rel, ExclusiveLock);
                                948                 :                :     }
                                949                 :                : 
                                950                 :                :     /*
                                951                 :                :      * If requested, invalidate size cache, so that smgrnblocks asks the
                                952                 :                :      * kernel.
                                953                 :                :      */
  375 andres@anarazel.de        954         [ +  + ]:          53605 :     if (flags & EB_CLEAR_SIZE_CACHE)
  235 tmunro@postgresql.or      955                 :           5748 :         bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
                                956                 :                : 
                                957                 :                :     /*
                                958                 :                :      * Estimate how many pages we'll need to extend by. This avoids acquiring
                                959                 :                :      * unnecessarily many victim buffers.
                                960                 :                :      */
                                961                 :          53605 :     current_size = smgrnblocks(bmr.smgr, fork);
                                962                 :                : 
                                963                 :                :     /*
                                964                 :                :      * Since no-one else can be looking at the page contents yet, there is no
                                965                 :                :      * difference between an exclusive lock and a cleanup-strength lock. Note
                                966                 :                :      * that we pass the original mode to ReadBuffer_common() below, when
                                967                 :                :      * falling back to reading the buffer due to a concurrent relation extension.
                                968                 :                :      */
  366 andres@anarazel.de        969   [ +  +  +  + ]:          53605 :     if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
  375                           970                 :          47502 :         flags |= EB_LOCK_TARGET;
                                971                 :                : 
                                972         [ +  + ]:         109213 :     while (current_size < extend_to)
                                973                 :                :     {
                                974                 :          55608 :         uint32      num_pages = lengthof(buffers);
                                975                 :                :         BlockNumber first_block;
                                976                 :                : 
                                977         [ +  + ]:          55608 :         if ((uint64) current_size + num_pages > extend_to)
                                978                 :          55542 :             num_pages = extend_to - current_size;
                                979                 :                : 
  235 tmunro@postgresql.or      980                 :          55608 :         first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
                                981                 :                :                                               num_pages, extend_to,
                                982                 :                :                                               buffers, &extended_by);
                                983                 :                : 
  375 andres@anarazel.de        984                 :          55608 :         current_size = first_block + extended_by;
                                985   [ -  +  -  - ]:          55608 :         Assert(num_pages != 0 || current_size >= extend_to);
                                986                 :                : 
  208 peter@eisentraut.org      987         [ +  + ]:GNC      117199 :         for (uint32 i = 0; i < extended_by; i++)
                                988                 :                :         {
  375 andres@anarazel.de        989         [ +  + ]:CBC       61591 :             if (first_block + i != extend_to - 1)
                                990                 :           7999 :                 ReleaseBuffer(buffers[i]);
                                991                 :                :             else
                                992                 :          53592 :                 buffer = buffers[i];
                                993                 :                :         }
                                994                 :                :     }
                                995                 :                : 
                                996                 :                :     /*
                                997                 :                :      * It's possible that another backend concurrently extended the relation.
                                998                 :                :      * In that case read the buffer.
                                999                 :                :      *
                               1000                 :                :      * XXX: Should we control this via a flag?
                               1001                 :                :      */
                               1002         [ +  + ]:          53605 :     if (buffer == InvalidBuffer)
                               1003                 :                :     {
                               1004         [ -  + ]:             13 :         Assert(extended_by == 0);
   11 tmunro@postgresql.or     1005                 :GNC          13 :         buffer = ReadBuffer_common(bmr.rel, bmr.smgr, 0,
                               1006                 :                :                                    fork, extend_to - 1, mode, strategy);
                               1007                 :                :     }
                               1008                 :                : 
  375 andres@anarazel.de       1009                 :CBC       53605 :     return buffer;
                               1010                 :                : }
                               1011                 :                : 
                               1012                 :                : /*
                               1013                 :                :  * Zero a buffer and lock it, as part of the implementation of
                               1014                 :                :  * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK.  The buffer must be already
                               1015                 :                :  * pinned.  It does not have to be valid, but it is valid and locked on
                               1016                 :                :  * return.
                               1017                 :                :  */
                               1018                 :                : static void
   11 tmunro@postgresql.or     1019                 :GNC      265372 : ZeroBuffer(Buffer buffer, ReadBufferMode mode)
                               1020                 :                : {
                               1021                 :                :     BufferDesc *bufHdr;
                               1022                 :                :     uint32      buf_state;
                               1023                 :                : 
                               1024   [ +  +  -  + ]:         265372 :     Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
                               1025                 :                : 
                               1026         [ -  + ]:         265372 :     if (BufferIsLocal(buffer))
   11 tmunro@postgresql.or     1027                 :UNC           0 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
                               1028                 :                :     else
                               1029                 :                :     {
   11 tmunro@postgresql.or     1030                 :GNC      265372 :         bufHdr = GetBufferDescriptor(buffer - 1);
                               1031         [ +  + ]:         265372 :         if (mode == RBM_ZERO_AND_LOCK)
                               1032                 :         263897 :             LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
                               1033                 :                :         else
                               1034                 :           1475 :             LockBufferForCleanup(buffer);
                               1035                 :                :     }
                               1036                 :                : 
                               1037                 :         265372 :     memset(BufferGetPage(buffer), 0, BLCKSZ);
                               1038                 :                : 
                               1039         [ -  + ]:         265372 :     if (BufferIsLocal(buffer))
                               1040                 :                :     {
   11 tmunro@postgresql.or     1041                 :UNC           0 :         buf_state = pg_atomic_read_u32(&bufHdr->state);
                               1042                 :              0 :         buf_state |= BM_VALID;
                               1043                 :              0 :         pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
                               1044                 :                :     }
                               1045                 :                :     else
                               1046                 :                :     {
   11 tmunro@postgresql.or     1047                 :GNC      265372 :         buf_state = LockBufHdr(bufHdr);
                               1048                 :         265372 :         buf_state |= BM_VALID;
                               1049                 :         265372 :         UnlockBufHdr(bufHdr, buf_state);
                               1050                 :                :     }
                               1051                 :         265372 : }
                               1052                 :                : 
                               1053                 :                : /*
                               1054                 :                :  * Pin a buffer for a given block.  *foundPtr is set to true if the block was
                               1055                 :                :  * already present, or false if more work is required to either read it in or
                               1056                 :                :  * zero it.
                               1057                 :                :  */
                               1058                 :                : static pg_attribute_always_inline Buffer
                               1059                 :       51857100 : PinBufferForBlock(Relation rel,
                               1060                 :                :                   SMgrRelation smgr,
                               1061                 :                :                   char smgr_persistence,
                               1062                 :                :                   ForkNumber forkNum,
                               1063                 :                :                   BlockNumber blockNum,
                               1064                 :                :                   BufferAccessStrategy strategy,
                               1065                 :                :                   bool *foundPtr)
                               1066                 :                : {
                               1067                 :                :     BufferDesc *bufHdr;
                               1068                 :                :     IOContext   io_context;
                               1069                 :                :     IOObject    io_object;
                               1070                 :                :     char        persistence;
                               1071                 :                : 
                               1072         [ -  + ]:       51857100 :     Assert(blockNum != P_NEW);
                               1073                 :                : 
                               1074                 :                :     /*
                               1075                 :                :      * If there is no Relation it usually implies recovery and thus permanent,
                               1076                 :                :      * but we take an argument because CreateAndCopyRelationData can reach us
                               1077                 :                :      * with only an SMgrRelation for an unlogged relation that we don't want
                               1078                 :                :      * to flag with BM_PERMANENT.
                               1079                 :                :      */
                               1080         [ +  + ]:       51857100 :     if (rel)
                               1081                 :       48511509 :         persistence = rel->rd_rel->relpersistence;
                               1082         [ -  + ]:        3345591 :     else if (smgr_persistence == 0)
   11 tmunro@postgresql.or     1083                 :UNC           0 :         persistence = RELPERSISTENCE_PERMANENT;
                               1084                 :                :     else
   11 tmunro@postgresql.or     1085                 :GNC     3345591 :         persistence = smgr_persistence;
                               1086                 :                : 
                               1087         [ +  + ]:       51857100 :     if (persistence == RELPERSISTENCE_TEMP)
                               1088                 :                :     {
                               1089                 :        1254204 :         io_context = IOCONTEXT_NORMAL;
                               1090                 :        1254204 :         io_object = IOOBJECT_TEMP_RELATION;
                               1091                 :                :     }
                               1092                 :                :     else
                               1093                 :                :     {
                               1094                 :       50602896 :         io_context = IOContextForStrategy(strategy);
                               1095                 :       50602896 :         io_object = IOOBJECT_RELATION;
                               1096                 :                :     }
                               1097                 :                : 
                               1098                 :                :     TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
                               1099                 :                :                                        smgr->smgr_rlocator.locator.spcOid,
                               1100                 :                :                                        smgr->smgr_rlocator.locator.dbOid,
                               1101                 :                :                                        smgr->smgr_rlocator.locator.relNumber,
                               1102                 :                :                                        smgr->smgr_rlocator.backend);
                               1103                 :                : 
                               1104         [ +  + ]:       51857100 :     if (persistence == RELPERSISTENCE_TEMP)
                               1105                 :                :     {
                               1106                 :        1254204 :         bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
                               1107         [ +  + ]:        1254204 :         if (*foundPtr)
 5234 rhaas@postgresql.org     1108                 :        1250405 :             pgBufferUsage.local_blks_hit++;
                               1109                 :                :     }
                               1110                 :                :     else
                               1111                 :                :     {
   11 tmunro@postgresql.or     1112                 :       50602896 :         bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
                               1113                 :                :                              strategy, foundPtr, io_context);
                               1114         [ +  + ]:       50602895 :         if (*foundPtr)
                               1115                 :       49223226 :             pgBufferUsage.shared_blks_hit++;
                               1116                 :                :     }
                               1117         [ +  + ]:       51857099 :     if (rel)
                               1118                 :                :     {
                               1119                 :                :         /*
                               1120                 :                :          * While pgBufferUsage's "read" counter isn't bumped unless we reach
                               1121                 :                :          * WaitReadBuffers() (so, not for hits, and not for buffers that are
                               1122                 :                :          * zeroed instead), the per-relation stats always count them.
                               1123                 :                :          */
                               1124   [ +  +  +  +  :       48511508 :         pgstat_count_buffer_read(rel);
                                              +  + ]
                               1125         [ +  + ]:       48511508 :         if (*foundPtr)
                               1126   [ +  +  -  +  :       47552299 :             pgstat_count_buffer_hit(rel);
                                              +  + ]
                               1127                 :                :     }
                               1128         [ +  + ]:       51857099 :     if (*foundPtr)
                               1129                 :                :     {
  375 andres@anarazel.de       1130                 :       50473631 :         VacuumPageHit++;
                               1131                 :       50473631 :         pgstat_count_io_op(io_object, io_context, IOOP_HIT);
                               1132         [ +  + ]:       50473631 :         if (VacuumCostActive)
                               1133                 :        1864214 :             VacuumCostBalance += VacuumCostPageHit;
                               1134                 :                : 
                               1135                 :                :         TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
                               1136                 :                :                                           smgr->smgr_rlocator.locator.spcOid,
                               1137                 :                :                                           smgr->smgr_rlocator.locator.dbOid,
                               1138                 :                :                                           smgr->smgr_rlocator.locator.relNumber,
                               1139                 :                :                                           smgr->smgr_rlocator.backend,
                               1140                 :                :                                           true);
                               1141                 :                :     }
                               1142                 :                : 
   11 tmunro@postgresql.or     1143                 :       51857099 :     return BufferDescriptorGetBuffer(bufHdr);
                               1144                 :                : }
                               1145                 :                : 
                               1146                 :                : /*
                               1147                 :                :  * ReadBuffer_common -- common logic for all ReadBuffer variants
                               1148                 :                :  *
                               1149                 :                :  * smgr is required, rel is optional unless using P_NEW.
                               1150                 :                :  */
                               1151                 :                : static pg_attribute_always_inline Buffer
                               1152                 :       49639253 : ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
                               1153                 :                :                   ForkNumber forkNum,
                               1154                 :                :                   BlockNumber blockNum, ReadBufferMode mode,
                               1155                 :                :                   BufferAccessStrategy strategy)
                               1156                 :                : {
                               1157                 :                :     ReadBuffersOperation operation;
                               1158                 :                :     Buffer      buffer;
                               1159                 :                :     int         flags;
                               1160                 :                : 
                               1161                 :                :     /*
                               1162                 :                :      * Backward compatibility path, most code should use ExtendBufferedRel()
                               1163                 :                :      * instead, as acquiring the extension lock inside ExtendBufferedRel()
                               1164                 :                :      * scales a lot better.
                               1165                 :                :      */
   11 tmunro@postgresql.or     1166         [ +  + ]:CBC    49639253 :     if (unlikely(blockNum == P_NEW))
                               1167                 :                :     {
                               1168                 :            240 :         uint32      flags = EB_SKIP_EXTENSION_LOCK;
                               1169                 :                : 
                               1170                 :                :         /*
                               1171                 :                :          * Since no-one else can be looking at the page contents yet, there is
                               1172                 :                :          * no difference between an exclusive lock and a cleanup-strength
                               1173                 :                :          * lock.
                               1174                 :                :          */
                               1175   [ +  -  -  + ]:            240 :         if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
   11 tmunro@postgresql.or     1176                 :UBC           0 :             flags |= EB_LOCK_FIRST;
                               1177                 :                : 
   11 tmunro@postgresql.or     1178                 :GNC         240 :         return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
                               1179                 :                :     }
                               1180                 :                : 
                               1181   [ +  +  +  +  :       49639013 :     if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK ||
                                              +  + ]
                               1182                 :                :                  mode == RBM_ZERO_AND_LOCK))
                               1183                 :                :     {
                               1184                 :                :         bool        found;
                               1185                 :                : 
                               1186                 :         265372 :         buffer = PinBufferForBlock(rel, smgr, smgr_persistence,
                               1187                 :                :                                    forkNum, blockNum, strategy, &found);
                               1188                 :         265372 :         ZeroBuffer(buffer, mode);
                               1189                 :         265372 :         return buffer;
                               1190                 :                :     }
                               1191                 :                : 
                               1192         [ +  + ]:       49373641 :     if (mode == RBM_ZERO_ON_ERROR)
                               1193                 :        1204347 :         flags = READ_BUFFERS_ZERO_ON_ERROR;
                               1194                 :                :     else
                               1195                 :       48169294 :         flags = 0;
                               1196                 :       49373641 :     operation.smgr = smgr;
                               1197                 :       49373641 :     operation.rel = rel;
                               1198                 :       49373641 :     operation.smgr_persistence = smgr_persistence;
                               1199                 :       49373641 :     operation.forknum = forkNum;
                               1200                 :       49373641 :     operation.strategy = strategy;
                               1201         [ +  + ]:       49373641 :     if (StartReadBuffer(&operation,
                               1202                 :                :                         &buffer,
                               1203                 :                :                         blockNum,
                               1204                 :                :                         flags))
                               1205                 :         856593 :         WaitReadBuffers(&operation);
                               1206                 :                : 
                               1207                 :       49373625 :     return buffer;
                               1208                 :                : }
                               1209                 :                : 
                               1210                 :                : static pg_attribute_always_inline bool
                               1211                 :       51553537 : StartReadBuffersImpl(ReadBuffersOperation *operation,
                               1212                 :                :                      Buffer *buffers,
                               1213                 :                :                      BlockNumber blockNum,
                               1214                 :                :                      int *nblocks,
                               1215                 :                :                      int flags)
                               1216                 :                : {
                               1217                 :       51553537 :     int         actual_nblocks = *nblocks;
                               1218                 :       51553537 :     int         io_buffers_len = 0;
                               1219                 :                : 
                               1220         [ -  + ]:       51553537 :     Assert(*nblocks > 0);
                               1221         [ -  + ]:       51553537 :     Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
                               1222                 :                : 
                               1223         [ +  + ]:       52701831 :     for (int i = 0; i < actual_nblocks; ++i)
                               1224                 :                :     {
                               1225                 :                :         bool        found;
                               1226                 :                : 
                               1227                 :      103183455 :         buffers[i] = PinBufferForBlock(operation->rel,
                               1228                 :       51591728 :                                        operation->smgr,
                               1229                 :       51591728 :                                        operation->smgr_persistence,
                               1230                 :                :                                        operation->forknum,
                               1231                 :                :                                        blockNum + i,
                               1232                 :                :                                        operation->strategy,
                               1233                 :                :                                        &found);
                               1234                 :                : 
   11 tmunro@postgresql.or     1235         [ +  + ]:CBC    51591727 :         if (found)
                               1236                 :                :         {
                               1237                 :                :             /*
                               1238                 :                :              * Terminate the read as soon as we get a hit.  It could be a
                               1239                 :                :              * single buffer hit, or it could be a hit that follows a readable
                               1240                 :                :              * range.  We don't want to create more than one readable range,
                               1241                 :                :              * so we stop here.
                               1242                 :                :              */
   11 tmunro@postgresql.or     1243                 :GNC    50443433 :             actual_nblocks = i + 1;
                               1244                 :       50443433 :             break;
                               1245                 :                :         }
                               1246                 :                :         else
                               1247                 :                :         {
                               1248                 :                :             /* Extend the readable range to cover this block. */
                               1249                 :        1148294 :             io_buffers_len++;
                               1250                 :                :         }
                               1251                 :                :     }
                               1252                 :       51553536 :     *nblocks = actual_nblocks;
                               1253                 :                : 
                               1254         [ +  + ]:       51553536 :     if (likely(io_buffers_len == 0))
                               1255                 :       50442667 :         return false;
                               1256                 :                : 
                               1257                 :                :     /* Populate information needed for I/O. */
                               1258                 :        1110869 :     operation->buffers = buffers;
                               1259                 :        1110869 :     operation->blocknum = blockNum;
                               1260                 :        1110869 :     operation->flags = flags;
                               1261                 :        1110869 :     operation->nblocks = actual_nblocks;
                               1262                 :        1110869 :     operation->io_buffers_len = io_buffers_len;
                               1263                 :                : 
                               1264         [ +  + ]:        1110869 :     if (flags & READ_BUFFERS_ISSUE_ADVICE)
                               1265                 :                :     {
                               1266                 :                :         /*
                               1267                 :                :          * In theory we should only do this if PinBufferForBlock() had to
                               1268                 :                :          * allocate new buffers above.  That way, if two calls to
                               1269                 :                :          * StartReadBuffers() were made for the same blocks before
                               1270                 :                :          * WaitReadBuffers(), only the first would issue the advice. That'd be
                               1271                 :                :          * a better simulation of true asynchronous I/O, which would only
                               1272                 :                :          * start the I/O once, but isn't done here for simplicity.  Note also
                               1273                 :                :          * that the following call might actually issue two advice calls if we
                               1274                 :                :          * cross a segment boundary; in a true asynchronous version we might
                               1275                 :                :          * choose to process only one real I/O at a time in that case.
                               1276                 :                :          */
                               1277                 :             84 :         smgrprefetch(operation->smgr,
                               1278                 :                :                      operation->forknum,
                               1279                 :                :                      blockNum,
                               1280                 :             84 :                      operation->io_buffers_len);
                               1281                 :                :     }
                               1282                 :                : 
                               1283                 :                :     /* Indicate that WaitReadBuffers() should be called. */
                               1284                 :        1110869 :     return true;
                               1285                 :                : }
                               1286                 :                : 
                               1287                 :                : /*
                               1288                 :                :  * Begin reading a range of blocks beginning at blockNum and extending for
                               1289                 :                :  * *nblocks.  On return, up to *nblocks pinned buffers holding those blocks
                               1290                 :                :  * are written into the buffers array, and *nblocks is updated to contain the
                               1291                 :                :  * actual number, which may be fewer than requested.  Caller sets some of the
                               1292                 :                :  * members of operation; see struct definition.
                               1293                 :                :  *
                               1294                 :                :  * If false is returned, no I/O is necessary.  If true is returned, one I/O
                               1295                 :                :  * has been started, and WaitReadBuffers() must be called with the same
                               1296                 :                :  * operation object before the buffers are accessed.  Along with the operation
                               1297                 :                :  * object, the caller-supplied array of buffers must remain valid until
                               1298                 :                :  * WaitReadBuffers() is called.
                               1299                 :                :  *
                               1300                 :                :  * Currently the I/O is only started with optional operating system advice if
                               1301                 :                :  * requested by the caller with READ_BUFFERS_ISSUE_ADVICE, and the real I/O
                               1302                 :                :  * happens synchronously in WaitReadBuffers().  In future work, true I/O could
                               1303                 :                :  * be initiated here.
                               1304                 :                :  */
                               1305                 :                : bool
                               1306                 :         805176 : StartReadBuffers(ReadBuffersOperation *operation,
                               1307                 :                :                  Buffer *buffers,
                               1308                 :                :                  BlockNumber blockNum,
                               1309                 :                :                  int *nblocks,
                               1310                 :                :                  int flags)
                               1311                 :                : {
                               1312                 :         805176 :     return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags);
                               1313                 :                : }
                               1314                 :                : 
                               1315                 :                : /*
                               1316                 :                :  * Single block version of StartReadBuffers().  This might save a few
                               1317                 :                :  * instructions when called from another translation unit, because it is
                               1318                 :                :  * specialized for nblocks == 1.
                               1319                 :                :  */
                               1320                 :                : bool
                               1321                 :       50748361 : StartReadBuffer(ReadBuffersOperation *operation,
                               1322                 :                :                 Buffer *buffer,
                               1323                 :                :                 BlockNumber blocknum,
                               1324                 :                :                 int flags)
                               1325                 :                : {
                               1326                 :       50748361 :     int         nblocks = 1;
                               1327                 :                :     bool        result;
                               1328                 :                : 
                               1329                 :       50748361 :     result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags);
                               1330         [ -  + ]:       50748360 :     Assert(nblocks == 1);       /* single block can't be short */
                               1331                 :                : 
                               1332                 :       50748360 :     return result;
                               1333                 :                : }
                               1334                 :                : 
                               1335                 :                : static inline bool
                               1336                 :        1148290 : WaitReadBuffersCanStartIO(Buffer buffer, bool nowait)
                               1337                 :                : {
                               1338         [ +  + ]:        1148290 :     if (BufferIsLocal(buffer))
                               1339                 :                :     {
                               1340                 :           3799 :         BufferDesc *bufHdr = GetLocalBufferDescriptor(-buffer - 1);
                               1341                 :                : 
                               1342                 :           3799 :         return (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
                               1343                 :                :     }
                               1344                 :                :     else
                               1345                 :        1144491 :         return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
                               1346                 :                : }
                               1347                 :                : 
                               1348                 :                : void
                               1349                 :        1110865 : WaitReadBuffers(ReadBuffersOperation *operation)
                               1350                 :                : {
                               1351                 :                :     Buffer     *buffers;
                               1352                 :                :     int         nblocks;
                               1353                 :                :     BlockNumber blocknum;
                               1354                 :                :     ForkNumber  forknum;
                               1355                 :                :     IOContext   io_context;
                               1356                 :                :     IOObject    io_object;
                               1357                 :                :     char        persistence;
                               1358                 :                : 
                               1359                 :                :     /*
                               1360                 :                :      * Currently operations are only allowed to include a read of some range,
                               1361                 :                :      * with an optional extra buffer that is already pinned at the end.  So
                               1362                 :                :      * nblocks can be at most one more than io_buffers_len.
                               1363                 :                :      */
                               1364   [ +  +  -  + ]:        1110865 :     Assert((operation->nblocks == operation->io_buffers_len) ||
                               1365                 :                :            (operation->nblocks == operation->io_buffers_len + 1));
                               1366                 :                : 
                               1367                 :                :     /* Find the range of the physical read we need to perform. */
                               1368                 :        1110865 :     nblocks = operation->io_buffers_len;
                               1369         [ -  + ]:        1110865 :     if (nblocks == 0)
   11 tmunro@postgresql.or     1370                 :UNC           0 :         return;                 /* nothing to do */
                               1371                 :                : 
   11 tmunro@postgresql.or     1372                 :GNC     1110865 :     buffers = &operation->buffers[0];
                               1373                 :        1110865 :     blocknum = operation->blocknum;
                               1374                 :        1110865 :     forknum = operation->forknum;
                               1375                 :                : 
                               1376         [ +  + ]:        1110865 :     persistence = operation->rel
                               1377                 :         921102 :         ? operation->rel->rd_rel->relpersistence
                               1378                 :                :         : RELPERSISTENCE_PERMANENT;
                               1379         [ +  + ]:        1110865 :     if (persistence == RELPERSISTENCE_TEMP)
                               1380                 :                :     {
                               1381                 :            787 :         io_context = IOCONTEXT_NORMAL;
                               1382                 :            787 :         io_object = IOOBJECT_TEMP_RELATION;
                               1383                 :                :     }
                               1384                 :                :     else
                               1385                 :                :     {
                               1386                 :        1110078 :         io_context = IOContextForStrategy(operation->strategy);
                               1387                 :        1110078 :         io_object = IOOBJECT_RELATION;
                               1388                 :                :     }
                               1389                 :                : 
                               1390                 :                :     /*
                               1391                 :                :      * We count all these blocks as read by this backend.  This is traditional
                               1392                 :                :      * behavior, but might turn out to be not true if we find that someone
                               1393                 :                :      * else has beaten us and completed the read of some of these blocks.  In
                               1394                 :                :      * that case the system globally double-counts, but we traditionally don't
                               1395                 :                :      * count this as a "hit", and we don't have a separate counter for "miss,
                               1396                 :                :      * but another backend completed the read".
                               1397                 :                :      */
                               1398         [ +  + ]:        1110865 :     if (persistence == RELPERSISTENCE_TEMP)
                               1399                 :            787 :         pgBufferUsage.local_blks_read += nblocks;
                               1400                 :                :     else
                               1401                 :        1110078 :         pgBufferUsage.shared_blks_read += nblocks;
                               1402                 :                : 
                               1403         [ +  + ]:        2221715 :     for (int i = 0; i < nblocks; ++i)
                               1404                 :                :     {
                               1405                 :                :         int         io_buffers_len;
                               1406                 :                :         Buffer      io_buffers[MAX_IO_COMBINE_LIMIT];
                               1407                 :                :         void       *io_pages[MAX_IO_COMBINE_LIMIT];
                               1408                 :                :         instr_time  io_start;
                               1409                 :                :         BlockNumber io_first_block;
                               1410                 :                : 
                               1411                 :                :         /*
                               1412                 :                :          * Skip this block if someone else has already completed it.  If an
                               1413                 :                :          * I/O is already in progress in another backend, this will wait for
                               1414                 :                :          * the outcome: either done, or something went wrong and we will
                               1415                 :                :          * retry.
                               1416                 :                :          */
                               1417         [ +  + ]:        1110865 :         if (!WaitReadBuffersCanStartIO(buffers[i], false))
                               1418                 :                :         {
                               1419                 :                :             /*
                               1420                 :                :              * Report this as a 'hit' for this backend, even though it must
                               1421                 :                :              * have started out as a miss in PinBufferForBlock().
                               1422                 :                :              */
                               1423                 :                :             TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + i,
                               1424                 :                :                                               operation->smgr->smgr_rlocator.locator.spcOid,
                               1425                 :                :                                               operation->smgr->smgr_rlocator.locator.dbOid,
                               1426                 :                :                                               operation->smgr->smgr_rlocator.locator.relNumber,
                               1427                 :                :                                               operation->smgr->smgr_rlocator.backend,
                               1428                 :                :                                               true);
                               1429                 :           5935 :             continue;
                               1430                 :                :         }
                               1431                 :                : 
                               1432                 :                :         /* We found a buffer that we need to read in. */
                               1433                 :        1104930 :         io_buffers[0] = buffers[i];
                               1434                 :        1104930 :         io_pages[0] = BufferGetBlock(buffers[i]);
                               1435                 :        1104930 :         io_first_block = blocknum + i;
                               1436                 :        1104930 :         io_buffers_len = 1;
                               1437                 :                : 
                               1438                 :                :         /*
                               1439                 :                :          * How many neighboring-on-disk blocks can we scatter-read into
                               1440                 :                :          * other buffers at the same time?  In this case we don't wait if we
                               1441                 :                :          * see an I/O already in progress.  We already hold BM_IO_IN_PROGRESS
                               1442                 :                :          * for the head block, so we should get on with that I/O as soon as
                               1443                 :                :          * possible.  We'll come back to this block again, above.
                               1444                 :                :          */
                               1445   [ +  +  +  - ]:        1179780 :         while ((i + 1) < nblocks &&
                               1446                 :          37425 :                WaitReadBuffersCanStartIO(buffers[i + 1], true))
                               1447                 :                :         {
                               1448                 :                :             /* Must be consecutive block numbers. */
                               1449         [ -  + ]:          37425 :             Assert(BufferGetBlockNumber(buffers[i + 1]) ==
                               1450                 :                :                    BufferGetBlockNumber(buffers[i]) + 1);
                               1451                 :                : 
                               1452                 :          37425 :             io_buffers[io_buffers_len] = buffers[++i];
                               1453                 :          37425 :             io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
                               1454                 :                :         }
                               1455                 :                : 
                               1456                 :        1104930 :         io_start = pgstat_prepare_io_time(track_io_timing);
                               1457                 :        1104930 :         smgrreadv(operation->smgr, forknum, io_first_block, io_pages, io_buffers_len);
                               1458                 :        1104915 :         pgstat_count_io_op_time(io_object, io_context, IOOP_READ, io_start,
                               1459                 :                :                                 io_buffers_len);
                               1460                 :                : 
                               1461                 :                :         /* Verify each block we read, and terminate the I/O. */
                               1462         [ +  + ]:        2247255 :         for (int j = 0; j < io_buffers_len; ++j)
                               1463                 :                :         {
                               1464                 :                :             BufferDesc *bufHdr;
                               1465                 :                :             Block       bufBlock;
                               1466                 :                : 
                               1467         [ +  + ]:        1142340 :             if (persistence == RELPERSISTENCE_TEMP)
                               1468                 :                :             {
                               1469                 :           3799 :                 bufHdr = GetLocalBufferDescriptor(-io_buffers[j] - 1);
                               1470                 :           3799 :                 bufBlock = LocalBufHdrGetBlock(bufHdr);
                               1471                 :                :             }
                               1472                 :                :             else
                               1473                 :                :             {
                               1474                 :        1138541 :                 bufHdr = GetBufferDescriptor(io_buffers[j] - 1);
                               1475                 :        1138541 :                 bufBlock = BufHdrGetBlock(bufHdr);
                               1476                 :                :             }
                               1477                 :                : 
                               1478                 :                :             /* check for garbage data */
                               1479         [ -  + ]:        1142340 :             if (!PageIsVerifiedExtended((Page) bufBlock, io_first_block + j,
                               1480                 :                :                                         PIV_LOG_WARNING | PIV_REPORT_STAT))
                               1481                 :                :             {
   11 tmunro@postgresql.or     1482   [ #  #  #  # ]:UNC           0 :                 if ((operation->flags & READ_BUFFERS_ZERO_ON_ERROR) || zero_damaged_pages)
                               1483                 :                :                 {
                               1484         [ #  # ]:              0 :                     ereport(WARNING,
                               1485                 :                :                             (errcode(ERRCODE_DATA_CORRUPTED),
                               1486                 :                :                              errmsg("invalid page in block %u of relation %s; zeroing out page",
                               1487                 :                :                                     io_first_block + j,
                               1488                 :                :                                     relpath(operation->smgr->smgr_rlocator, forknum))));
                               1489                 :              0 :                     memset(bufBlock, 0, BLCKSZ);
                               1490                 :                :                 }
                               1491                 :                :                 else
                               1492         [ #  # ]:              0 :                     ereport(ERROR,
                               1493                 :                :                             (errcode(ERRCODE_DATA_CORRUPTED),
                               1494                 :                :                              errmsg("invalid page in block %u of relation %s",
                               1495                 :                :                                     io_first_block + j,
                               1496                 :                :                                     relpath(operation->smgr->smgr_rlocator, forknum))));
                               1497                 :                :             }
                               1498                 :                : 
                               1499                 :                :             /* Terminate I/O and set BM_VALID. */
   11 tmunro@postgresql.or     1500         [ +  + ]:GNC     1142340 :             if (persistence == RELPERSISTENCE_TEMP)
                               1501                 :                :             {
                               1502                 :           3799 :                 uint32      buf_state = pg_atomic_read_u32(&bufHdr->state);
                               1503                 :                : 
                               1504                 :           3799 :                 buf_state |= BM_VALID;
                               1505                 :           3799 :                 pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
                               1506                 :                :             }
                               1507                 :                :             else
                               1508                 :                :             {
                               1509                 :                :                 /* Set BM_VALID, terminate IO, and wake up any waiters */
                               1510                 :        1138541 :                 TerminateBufferIO(bufHdr, false, BM_VALID, true);
                               1511                 :                :             }
                               1512                 :                : 
                               1513                 :                :             /* Report I/Os as completing individually. */
                               1514                 :                :             TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, io_first_block + j,
                               1515                 :                :                                               operation->smgr->smgr_rlocator.locator.spcOid,
                               1516                 :                :                                               operation->smgr->smgr_rlocator.locator.dbOid,
                               1517                 :                :                                               operation->smgr->smgr_rlocator.locator.relNumber,
                               1518                 :                :                                               operation->smgr->smgr_rlocator.backend,
                               1519                 :                :                                               false);
                               1520                 :                :         }
                               1521                 :                : 
                               1522                 :        1104915 :         VacuumPageMiss += io_buffers_len;
                               1523         [ +  + ]:        1104915 :         if (VacuumCostActive)
                               1524                 :          15545 :             VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
                               1525                 :                :     }
                               1526                 :                : }
                               1527                 :                : 
                               1528                 :                : /*
                               1529                 :                :  * BufferAlloc -- subroutine for PinBufferForBlock.  Handles lookup of a shared
                               1530                 :                :  *      buffer.  If no buffer exists already, selects a replacement victim and
                               1531                 :                :  *      evicts the old page, but does NOT read in new page.
                               1532                 :                :  *
                               1533                 :                :  * "strategy" can be a buffer replacement strategy object, or NULL for
                               1534                 :                :  * the default strategy.  The selected buffer's usage_count is advanced when
                               1535                 :                :  * using the default strategy, but otherwise possibly not (see PinBuffer).
                               1536                 :                :  *
                               1537                 :                :  * The returned buffer is pinned and is already marked as holding the
                               1538                 :                :  * desired page.  If the page was already in the buffer pool and PinBuffer()
                               1539                 :                :  * reports its contents valid, *foundPtr is set true; otherwise it is set false.
                               1540                 :                :  *
                               1541                 :                :  * io_context is passed by value and is only consulted on a buffer pool
                               1542                 :                :  * miss, where it is handed to GetVictimBuffer(); nothing is written back
                               1543                 :                :  * through it.
                               1544                 :                :  *
                               1545                 :                :  * No locks are held either at entry or exit.
                               1546                 :                :  */
                               1547                 :                : static pg_attribute_always_inline BufferDesc *
 4855 rhaas@postgresql.org     1548                 :CBC    50602896 : BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                               1549                 :                :             BlockNumber blockNum,
                               1550                 :                :             BufferAccessStrategy strategy,
                               1551                 :                :             bool *foundPtr, IOContext io_context)
                               1552                 :                : {
                               1553                 :                :     BufferTag   newTag;         /* identity of requested block */
                               1554                 :                :     uint32      newHash;        /* hash value for newTag */
                               1555                 :                :     LWLock     *newPartitionLock;   /* buffer partition lock for it */
                               1556                 :                :     int         existing_buf_id;
                               1557                 :                :     Buffer      victim_buffer;
                               1558                 :                :     BufferDesc *victim_buf_hdr;
                               1559                 :                :     uint32      victim_buf_state;
                               1560                 :                : 
                               1561                 :                :     /* Make sure we will have room to remember the buffer pin */
  158 heikki.linnakangas@i     1562                 :GNC    50602896 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               1563                 :       50602896 :     ReservePrivateRefCountEntry();
                               1564                 :                : 
                               1565                 :                :     /* create a tag so we can lookup the buffer */
  627 rhaas@postgresql.org     1566                 :CBC    50602896 :     InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
                               1567                 :                : 
                               1568                 :                :     /* determine its hash code and partition lock ID */
 6475 tgl@sss.pgh.pa.us        1569                 :       50602896 :     newHash = BufTableHashCode(&newTag);
                               1570                 :       50602896 :     newPartitionLock = BufMappingPartitionLock(newHash);
                               1571                 :                : 
                               1572                 :                :     /* see if the block is in the buffer pool already */
                               1573                 :       50602896 :     LWLockAcquire(newPartitionLock, LW_SHARED);
  375 andres@anarazel.de       1574                 :       50602895 :     existing_buf_id = BufTableLookup(&newTag, newHash);
                               1575         [ +  + ]:       50602895 :     if (existing_buf_id >= 0)
                               1576                 :                :     {
                               1577                 :                :         BufferDesc *buf;
                               1578                 :                :         bool        valid;
                               1579                 :                : 
                               1580                 :                :         /*
                               1581                 :                :          * Found it.  Now, pin the buffer so no one can steal it from the
                               1582                 :                :          * buffer pool, and check to see if the correct data has been loaded
                               1583                 :                :          * into the buffer.
                               1584                 :                :          */
                               1585                 :       49226458 :         buf = GetBufferDescriptor(existing_buf_id);
                               1586                 :                : 
 6164 tgl@sss.pgh.pa.us        1587                 :       49226458 :         valid = PinBuffer(buf, strategy);
                               1588                 :                : 
                               1589                 :                :         /* Can release the mapping lock as soon as we've pinned it */
 6475                          1590                 :       49226458 :         LWLockRelease(newPartitionLock);
                               1591                 :                : 
 2433 peter_e@gmx.net          1592                 :       49226458 :         *foundPtr = true;
                               1593                 :                : 
 6981 tgl@sss.pgh.pa.us        1594         [ +  + ]:       49226458 :         if (!valid)
                               1595                 :                :         {
                               1596                 :                :             /*
                               1597                 :                :              * We can only get here if (a) someone else is still reading in
                               1598                 :                :              * the page, (b) a previous read attempt failed, or (c) someone
                               1599                 :                :              * called StartReadBuffers() but not yet WaitReadBuffers().
                               1600                 :                :              */
   11 tmunro@postgresql.or     1601                 :GNC        3986 :             *foundPtr = false;
                               1602                 :                :         }
                               1603                 :                : 
 9357 bruce@momjian.us         1604                 :CBC    49226458 :         return buf;
                               1605                 :                :     }
                               1606                 :                : 
                               1607                 :                :     /*
                               1608                 :                :      * Didn't find it in the buffer pool.  We'll have to initialize a new
                               1609                 :                :      * buffer.  Remember to unlock the mapping lock while doing the work.
                               1610                 :                :      */
 6475 tgl@sss.pgh.pa.us        1611                 :        1376437 :     LWLockRelease(newPartitionLock);
                               1612                 :                : 
                               1613                 :                :     /*
                               1614                 :                :      * Acquire a victim buffer.  We hold no conflicting locks at this point,
                               1615                 :                :      * so somebody else might concurrently do the same for this page.  If so
                               1616                 :                :      * we'll have to undo our work later.
                               1617                 :                :      */
  375 andres@anarazel.de       1618                 :        1376437 :     victim_buffer = GetVictimBuffer(strategy, io_context);
                               1619                 :        1376437 :     victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
                               1620                 :                : 
                               1621                 :                :     /*
                               1622                 :                :      * Try to make a hashtable entry for the buffer under its new tag. If
                               1623                 :                :      * somebody else inserted another buffer for the tag, we'll release the
                               1624                 :                :      * victim buffer we acquired and use the already inserted one.
                               1625                 :                :      */
                               1626                 :        1376437 :     LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
                               1627                 :        1376437 :     existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
                               1628         [ +  + ]:        1376437 :     if (existing_buf_id >= 0)
                               1629                 :                :     {
                               1630                 :                :         BufferDesc *existing_buf_hdr;
                               1631                 :                :         bool        valid;
                               1632                 :                : 
                               1633                 :                :         /*
                               1634                 :                :          * Got a collision. Someone has already done what we were about to do.
                               1635                 :                :          * We'll just handle this as if it were found in the buffer pool in
                               1636                 :                :          * the first place.  First, give up the buffer we were planning to
                               1637                 :                :          * use.
                               1638                 :                :          *
                               1639                 :                :          * We could do this after releasing the partition lock, but then we'd
                               1640                 :                :          * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
                               1641                 :                :          * before acquiring the lock, for the rare case of such a collision.
                               1642                 :                :          */
                               1643                 :           2718 :         UnpinBuffer(victim_buf_hdr);
                               1644                 :                : 
                               1645                 :                :         /*
                               1646                 :                :          * The victim buffer we acquired previously is clean and unused, let
                               1647                 :                :          * it be found again quickly
                               1648                 :                :          */
                               1649                 :           2718 :         StrategyFreeBuffer(victim_buf_hdr);
                               1650                 :                : 
                               1651                 :                :         /* remaining code should match code at top of routine */
                               1652                 :                : 
                               1653                 :           2718 :         existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
                               1654                 :                : 
                               1655                 :           2718 :         valid = PinBuffer(existing_buf_hdr, strategy);
                               1656                 :                : 
                               1657                 :                :         /* Can release the mapping lock as soon as we've pinned it */
                               1658                 :           2718 :         LWLockRelease(newPartitionLock);
                               1659                 :                : 
                               1660                 :           2718 :         *foundPtr = true;
                               1661                 :                : 
                               1662         [ +  + ]:           2718 :         if (!valid)
                               1663                 :                :         {
                               1664                 :                :             /*
                               1665                 :                :              * We can only get here if (a) someone else is still reading in
                               1666                 :                :              * the page, (b) a previous read attempt failed, or (c) someone
                               1667                 :                :              * called StartReadBuffers() but not yet WaitReadBuffers().
                               1668                 :                :              */
   11 tmunro@postgresql.or     1669                 :GNC        1964 :             *foundPtr = false;
                               1670                 :                :         }
                               1671                 :                : 
  375 andres@anarazel.de       1672                 :CBC        2718 :         return existing_buf_hdr;
                               1673                 :                :     }
                               1674                 :                : 
                               1675                 :                :     /*
                               1676                 :                :      * Need to lock the buffer header too in order to change its tag.
                               1677                 :                :      */
                               1678                 :        1373719 :     victim_buf_state = LockBufHdr(victim_buf_hdr);
                               1679                 :                : 
                               1680                 :                :     /* some sanity checks while we hold the buffer header lock */
                               1681         [ -  + ]:        1373719 :     Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
                               1682         [ -  + ]:        1373719 :     Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
                               1683                 :                : 
                               1684                 :        1373719 :     victim_buf_hdr->tag = newTag;
                               1685                 :                : 
                               1686                 :                :     /*
                               1687                 :                :      * Make sure BM_PERMANENT is set for buffers that must be written at every
                               1688                 :                :      * checkpoint.  Unlogged buffers only need to be written at shutdown
                               1689                 :                :      * checkpoints, except for their "init" forks, which need to be treated
                               1690                 :                :      * just like permanent relations.
                               1691                 :                :      */
                               1692                 :        1373719 :     victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
 2588 rhaas@postgresql.org     1693   [ +  +  -  + ]:        1373719 :     if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
  375 andres@anarazel.de       1694                 :        1373674 :         victim_buf_state |= BM_PERMANENT;
                               1695                 :                : 
                               1696                 :        1373719 :     UnlockBufHdr(victim_buf_hdr, victim_buf_state);
                               1697                 :                : 
 6475 tgl@sss.pgh.pa.us        1698                 :        1373719 :     LWLockRelease(newPartitionLock);
                               1699                 :                : 
                               1700                 :                :     /*
                               1701                 :                :      * Buffer contents are currently invalid (BM_VALID is not set).
                               1702                 :                :      */
   11 tmunro@postgresql.or     1703                 :GNC     1373719 :     *foundPtr = false;
                               1704                 :                : 
  375 andres@anarazel.de       1705                 :CBC     1373719 :     return victim_buf_hdr;
                               1706                 :                : }
                               1707                 :                : 
                               1708                 :                : /*
                               1709                 :                :  * InvalidateBuffer -- mark a shared buffer invalid and return it to the
                               1710                 :                :  * freelist.
                               1711                 :                :  *
                               1712                 :                :  * The buffer header spinlock must be held at entry.  We drop it before
                               1713                 :                :  * returning.  (This is sane because the caller must have locked the
                               1714                 :                :  * buffer in order to be sure it should be dropped.)
                               1715                 :                :  *
                               1716                 :                :  * This is used only in contexts such as dropping a relation.  We assume
                               1717                 :                :  * that no other backend could possibly be interested in using the page,
                               1718                 :                :  * so the only reason the buffer might be pinned is if someone else is
                               1719                 :                :  * trying to write it out.  We have to let them finish before we can
                               1720                 :                :  * reclaim the buffer.
                               1721                 :                :  *
                               1722                 :                :  * The buffer could get reclaimed by someone else while we are waiting
                               1723                 :                :  * to acquire the necessary locks; if so, don't mess it up.
                               1724                 :                :  */
                               1725                 :                : static void
 3072 rhaas@postgresql.org     1726                 :          93608 : InvalidateBuffer(BufferDesc *buf)
                               1727                 :                : {
                               1728                 :                :     BufferTag   oldTag;
                               1729                 :                :     uint32      oldHash;        /* hash value for oldTag */
                               1730                 :                :     LWLock     *oldPartitionLock;   /* buffer partition lock for it */
                               1731                 :                :     uint32      oldFlags;
                               1732                 :                :     uint32      buf_state;
                               1733                 :                : 
                               1734                 :                :     /* Save the original buffer tag before dropping the spinlock */
 6981 tgl@sss.pgh.pa.us        1735                 :          93608 :     oldTag = buf->tag;
                               1736                 :                : 
 2926 andres@anarazel.de       1737                 :          93608 :     buf_state = pg_atomic_read_u32(&buf->state);
                               1738         [ -  + ]:          93608 :     Assert(buf_state & BM_LOCKED);
                               1739                 :          93608 :     UnlockBufHdr(buf, buf_state);
                               1740                 :                : 
                               1741                 :                :     /*
                               1742                 :                :      * Need to compute the old tag's hashcode and partition lock ID. XXX is it
                               1743                 :                :      * worth storing the hashcode in BufferDesc so we need not recompute it
                               1744                 :                :      * here?  Probably not.
                               1745                 :                :      */
 6475 tgl@sss.pgh.pa.us        1746                 :          93608 :     oldHash = BufTableHashCode(&oldTag);
                               1747                 :          93608 :     oldPartitionLock = BufMappingPartitionLock(oldHash);
                               1748                 :                : 
 6981                          1749                 :          93609 : retry:
                               1750                 :                : 
                               1751                 :                :     /*
                               1752                 :                :      * Acquire exclusive mapping lock in preparation for changing the buffer's
                               1753                 :                :      * association.
                               1754                 :                :      */
 6475                          1755                 :          93609 :     LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
                               1756                 :                : 
                               1757                 :                :     /* Re-lock the buffer header */
 2926 andres@anarazel.de       1758                 :          93609 :     buf_state = LockBufHdr(buf);
                               1759                 :                : 
                               1760                 :                :     /* If the tag changed while we were waiting for the locks, do nothing */
  627 rhaas@postgresql.org     1761         [ +  + ]:          93609 :     if (!BufferTagsEqual(&buf->tag, &oldTag))
                               1762                 :                :     {
 2926 andres@anarazel.de       1763                 :              1 :         UnlockBufHdr(buf, buf_state);
 6475 tgl@sss.pgh.pa.us        1764                 :              1 :         LWLockRelease(oldPartitionLock);
 6981                          1765                 :              1 :         return;
                               1766                 :                :     }
                               1767                 :                : 
                               1768                 :                :     /*
                               1769                 :                :      * We assume the only reason for it to be pinned is that someone else is
                               1770                 :                :      * flushing the page out.  Wait for them to finish.  (This could be an
                               1771                 :                :      * infinite loop if the refcount is messed up... it would be nice to time
                               1772                 :                :      * out after a while, but there seems no way to be sure how many loops may
                               1773                 :                :      * be needed.  Note that if the other guy has pinned the buffer but not
                               1774                 :                :      * yet done StartBufferIO, WaitIO will fall through and we'll effectively
                               1775                 :                :      * be busy-looping here.)
                               1776                 :                :      */
 2926 andres@anarazel.de       1777         [ +  + ]:          93608 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
                               1778                 :                :     {
                               1779                 :              1 :         UnlockBufHdr(buf, buf_state);
 6475 tgl@sss.pgh.pa.us        1780                 :              1 :         LWLockRelease(oldPartitionLock);
                               1781                 :                :         /* safety check: should definitely not be our *own* pin */
 3168 andres@anarazel.de       1782         [ -  + ]:              1 :         if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
 6967 tgl@sss.pgh.pa.us        1783         [ #  # ]:UBC           0 :             elog(ERROR, "buffer is pinned in InvalidateBuffer");
 6981 tgl@sss.pgh.pa.us        1784                 :CBC           1 :         WaitIO(buf);
                               1785                 :              1 :         goto retry;
                               1786                 :                :     }
                               1787                 :                : 
                               1788                 :                :     /*
                               1789                 :                :      * Clear out the buffer's tag and flags.  We must do this to ensure that
                               1790                 :                :      * linear scans of the buffer array don't think the buffer is valid.
                               1791                 :                :      */
 2926 andres@anarazel.de       1792                 :          93607 :     oldFlags = buf_state & BUF_FLAG_MASK;
  627 rhaas@postgresql.org     1793                 :          93607 :     ClearBufferTag(&buf->tag);
 2926 andres@anarazel.de       1794                 :          93607 :     buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
                               1795                 :          93607 :     UnlockBufHdr(buf, buf_state);
                               1796                 :                : 
                               1797                 :                :     /*
                               1798                 :                :      * Remove the buffer from the lookup hashtable, if it was in there.
                               1799                 :                :      */
 6981 tgl@sss.pgh.pa.us        1800         [ +  - ]:          93607 :     if (oldFlags & BM_TAG_VALID)
 6475                          1801                 :          93607 :         BufTableDelete(&oldTag, oldHash);
                               1802                 :                : 
                               1803                 :                :     /*
                               1804                 :                :      * Done with mapping lock.
                               1805                 :                :      */
                               1806                 :          93607 :     LWLockRelease(oldPartitionLock);
                               1807                 :                : 
                               1808                 :                :     /*
                               1809                 :                :      * Insert the buffer at the head of the list of free buffers.
                               1810                 :                :      */
 6164                          1811                 :          93607 :     StrategyFreeBuffer(buf);
                               1812                 :                : }
                               1813                 :                : 
                               1814                 :                : /*
                               1815                 :                :  * Helper routine for GetVictimBuffer()
                               1816                 :                :  *
                               1817                 :                :  * Needs to be called on a buffer with a valid tag, pinned, but without the
                               1818                 :                :  * buffer header spinlock held.
                               1819                 :                :  *
                               1820                 :                :  * Returns true if the buffer can be reused, in which case the buffer is only
                               1821                 :                :  * pinned by this backend and marked as invalid, false otherwise.
                               1822                 :                :  */
                               1823                 :                : static bool
  375 andres@anarazel.de       1824                 :        1003491 : InvalidateVictimBuffer(BufferDesc *buf_hdr)
                               1825                 :                : {
                               1826                 :                :     uint32      buf_state;
                               1827                 :                :     uint32      hash;
                               1828                 :                :     LWLock     *partition_lock;
                               1829                 :                :     BufferTag   tag;
                               1830                 :                : 
                               1831         [ -  + ]:        1003491 :     Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
                               1832                 :                : 
                               1833                 :                :     /* have buffer pinned, so it's safe to read tag without lock */
                               1834                 :        1003491 :     tag = buf_hdr->tag;
                               1835                 :                : 
                               1836                 :        1003491 :     hash = BufTableHashCode(&tag);
                               1837                 :        1003491 :     partition_lock = BufMappingPartitionLock(hash);
                               1838                 :                : 
                               1839                 :        1003491 :     LWLockAcquire(partition_lock, LW_EXCLUSIVE);
                               1840                 :                : 
                               1841                 :                :     /* lock the buffer header */
                               1842                 :        1003491 :     buf_state = LockBufHdr(buf_hdr);
                               1843                 :                : 
                               1844                 :                :     /*
                               1845                 :                :      * We have the buffer pinned, so nobody else should have been able to unset
                               1846                 :                :      * this concurrently.
                               1847                 :                :      */
                               1848         [ -  + ]:        1003491 :     Assert(buf_state & BM_TAG_VALID);
                               1849         [ -  + ]:        1003491 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               1850         [ -  + ]:        1003491 :     Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
                               1851                 :                : 
                               1852                 :                :     /*
                               1853                 :                :      * If somebody else pinned the buffer since, or even worse, dirtied it,
                               1854                 :                :      * give up on this buffer: It's clearly in use.
                               1855                 :                :      */
                               1856   [ +  +  +  + ]:        1003491 :     if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
                               1857                 :                :     {
                               1858         [ -  + ]:            278 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               1859                 :                : 
                               1860                 :            278 :         UnlockBufHdr(buf_hdr, buf_state);
                               1861                 :            278 :         LWLockRelease(partition_lock);
                               1862                 :                : 
                               1863                 :            278 :         return false;
                               1864                 :                :     }
                               1865                 :                : 
                               1866                 :                :     /*
                               1867                 :                :      * Clear out the buffer's tag and flags and usagecount.  This is not
                               1868                 :                :      * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
                               1869                 :                :      * doing anything with the buffer. But currently it's beneficial, as the
                               1870                 :                :      * cheaper pre-checks for several linear scans of shared buffers use the
                               1871                 :                :      * tag (see e.g. FlushDatabaseBuffers()).
                               1872                 :                :      */
                               1873                 :        1003213 :     ClearBufferTag(&buf_hdr->tag);
                               1874                 :        1003213 :     buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
                               1875                 :        1003213 :     UnlockBufHdr(buf_hdr, buf_state);
                               1876                 :                : 
                               1877         [ -  + ]:        1003213 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               1878                 :                : 
                               1879                 :                :     /* finally delete buffer from the buffer mapping table */
                               1880                 :        1003213 :     BufTableDelete(&tag, hash);
                               1881                 :                : 
                               1882                 :        1003213 :     LWLockRelease(partition_lock);
                               1883                 :                : 
                               1884         [ -  + ]:        1003213 :     Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
                               1885         [ -  + ]:        1003213 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               1886         [ -  + ]:        1003213 :     Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
                               1887                 :                : 
                               1888                 :        1003213 :     return true;
                               1889                 :                : }
                               1890                 :                : 
                               1891                 :                : static Buffer
                               1892                 :        1582759 : GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
                               1893                 :                : {
                               1894                 :                :     BufferDesc *buf_hdr;
                               1895                 :                :     Buffer      buf;
                               1896                 :                :     uint32      buf_state;
                               1897                 :                :     bool        from_ring;
                               1898                 :                : 
                               1899                 :                :     /*
                               1900                 :                :      * Ensure, while the spinlock's not yet held, that there's a free refcount
                               1901                 :                :      * entry, and a resource owner slot for the pin.
                               1902                 :                :      */
                               1903                 :        1582759 :     ReservePrivateRefCountEntry();
  158 heikki.linnakangas@i     1904                 :GNC     1582759 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               1905                 :                : 
                               1906                 :                :     /* we return here if a prospective victim buffer gets used concurrently */
  375 andres@anarazel.de       1907                 :CBC        4398 : again:
                               1908                 :                : 
                               1909                 :                :     /*
                               1910                 :                :      * Select a victim buffer.  The buffer is returned with its header
                               1911                 :                :      * spinlock still held!
                               1912                 :                :      */
                               1913                 :        1587157 :     buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
                               1914                 :        1587157 :     buf = BufferDescriptorGetBuffer(buf_hdr);
                               1915                 :                : 
                               1916         [ -  + ]:        1587157 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
                               1917                 :                : 
                               1918                 :                :     /* Pin the buffer and then release the buffer spinlock */
                               1919                 :        1587157 :     PinBuffer_Locked(buf_hdr);
                               1920                 :                : 
                               1921                 :                :     /*
                               1922                 :                :      * We shouldn't have any other pins for this buffer.
                               1923                 :                :      */
                               1924                 :        1587157 :     CheckBufferIsPinnedOnce(buf);
                               1925                 :                : 
                               1926                 :                :     /*
                               1927                 :                :      * If the buffer was dirty, try to write it out.  There is a race
                               1928                 :                :      * condition here, in that someone might dirty it after we released the
                               1929                 :                :      * buffer header lock above, or even while we are writing it out (since
                               1930                 :                :      * our share-lock won't prevent hint-bit updates).  We will recheck the
                               1931                 :                :      * dirty bit after re-locking the buffer header.
                               1932                 :                :      */
                               1933         [ +  + ]:        1587157 :     if (buf_state & BM_DIRTY)
                               1934                 :                :     {
                               1935                 :                :         LWLock     *content_lock;
                               1936                 :                : 
                               1937         [ -  + ]:         256301 :         Assert(buf_state & BM_TAG_VALID);
                               1938         [ -  + ]:         256301 :         Assert(buf_state & BM_VALID);
                               1939                 :                : 
                               1940                 :                :         /*
                               1941                 :                :          * We need a share-lock on the buffer contents to write it out (else
                               1942                 :                :          * we might write invalid data, eg because someone else is compacting
                               1943                 :                :          * the page contents while we write).  We must use a conditional lock
                               1944                 :                :          * acquisition here to avoid deadlock.  Even though the buffer was not
                               1945                 :                :          * pinned (and therefore surely not locked) when StrategyGetBuffer
                               1946                 :                :          * returned it, someone else could have pinned and exclusive-locked it
                               1947                 :                :          * by the time we get here. If we try to get the lock unconditionally,
                               1948                 :                :          * we'd block waiting for them; if they later block waiting for us,
                               1949                 :                :          * deadlock ensues. (This has been observed to happen when two
                               1950                 :                :          * backends are both trying to split btree index pages, and the second
                               1951                 :                :          * one just happens to be trying to split the page the first one got
                               1952                 :                :          * from StrategyGetBuffer.)
                               1953                 :                :          */
                               1954                 :         256301 :         content_lock = BufferDescriptorGetContentLock(buf_hdr);
                               1955         [ -  + ]:         256301 :         if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
                               1956                 :                :         {
                               1957                 :                :             /*
                               1958                 :                :              * Someone else has locked the buffer, so give it up and loop back
                               1959                 :                :              * to get another one.
                               1960                 :                :              */
  375 andres@anarazel.de       1961                 :UBC           0 :             UnpinBuffer(buf_hdr);
                               1962                 :              0 :             goto again;
                               1963                 :                :         }
                               1964                 :                : 
                               1965                 :                :         /*
                               1966                 :                :          * If using a nondefault strategy, and writing the buffer would
                               1967                 :                :          * require a WAL flush, let the strategy decide whether to go ahead
                               1968                 :                :          * and write/reuse the buffer or to choose another victim.  We need a
                               1969                 :                :          * lock to inspect the page LSN, so this can't be done inside
                               1970                 :                :          * StrategyGetBuffer.
                               1971                 :                :          */
  375 andres@anarazel.de       1972         [ +  + ]:CBC      256301 :         if (strategy != NULL)
                               1973                 :                :         {
                               1974                 :                :             XLogRecPtr  lsn;
                               1975                 :                : 
                               1976                 :                :             /* Read the LSN while holding buffer header lock */
                               1977                 :          62421 :             buf_state = LockBufHdr(buf_hdr);
                               1978                 :          62421 :             lsn = BufferGetLSN(buf_hdr);
                               1979                 :          62421 :             UnlockBufHdr(buf_hdr, buf_state);
                               1980                 :                : 
                               1981         [ +  + ]:          62421 :             if (XLogNeedsFlush(lsn)
                               1982         [ +  + ]:           6524 :                 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
                               1983                 :                :             {
                               1984                 :           4120 :                 LWLockRelease(content_lock);
                               1985                 :           4120 :                 UnpinBuffer(buf_hdr);
                               1986                 :           4120 :                 goto again;
                               1987                 :                :             }
                               1988                 :                :         }
                               1989                 :                : 
                               1990                 :                :         /* OK, do the I/O */
                               1991                 :         252181 :         FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
                               1992                 :         252181 :         LWLockRelease(content_lock);
                               1993                 :                : 
  333                          1994                 :         252181 :         ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
                               1995                 :                :                                       &buf_hdr->tag);
                               1996                 :                :     }
                               1997                 :                : 
                               1998                 :                : 
  375                          1999         [ +  + ]:        1583037 :     if (buf_state & BM_VALID)
                               2000                 :                :     {
                               2001                 :                :         /*
                               2002                 :                :          * When a BufferAccessStrategy is in use, blocks evicted from shared
                               2003                 :                :          * buffers are counted as IOOP_EVICT in the corresponding context
                               2004                 :                :          * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
                               2005                 :                :          * strategy in two cases: 1) while initially claiming buffers for the
                               2006                 :                :          * strategy ring 2) to replace an existing strategy ring buffer
                               2007                 :                :          * because it is pinned or in use and cannot be reused.
                               2008                 :                :          *
                               2009                 :                :          * Blocks evicted from buffers already in the strategy ring are
                               2010                 :                :          * counted as IOOP_REUSE in the corresponding strategy context.
                               2011                 :                :          *
                               2012                 :                :          * At this point, we can accurately count evictions and reuses,
                               2013                 :                :          * because we have successfully claimed the valid buffer. Previously,
                               2014                 :                :          * we may have been forced to release the buffer due to concurrent
                               2015                 :                :          * pinners or erroring out.
                               2016                 :                :          */
                               2017                 :        1003490 :         pgstat_count_io_op(IOOBJECT_RELATION, io_context,
                               2018         [ +  + ]:        1003490 :                            from_ring ? IOOP_REUSE : IOOP_EVICT);
                               2019                 :                :     }
                               2020                 :                : 
                               2021                 :                :     /*
                               2022                 :                :      * If the buffer has an entry in the buffer mapping table, delete it. This
                               2023                 :                :      * can fail because another backend could have pinned or dirtied the
                               2024                 :                :      * buffer.
                               2025                 :                :      */
                               2026   [ +  +  +  + ]:        1583037 :     if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
                               2027                 :                :     {
                               2028                 :            278 :         UnpinBuffer(buf_hdr);
                               2029                 :            278 :         goto again;
                               2030                 :                :     }
                               2031                 :                : 
                               2032                 :                :     /* a final set of sanity checks */
                               2033                 :                : #ifdef USE_ASSERT_CHECKING
                               2034                 :        1582759 :     buf_state = pg_atomic_read_u32(&buf_hdr->state);
                               2035                 :                : 
                               2036         [ -  + ]:        1582759 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
                               2037         [ -  + ]:        1582759 :     Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
                               2038                 :                : 
                               2039                 :        1582759 :     CheckBufferIsPinnedOnce(buf);
                               2040                 :                : #endif
                               2041                 :                : 
                               2042                 :        1582759 :     return buf;
                               2043                 :                : }
                               2044                 :                : 
                               2045                 :                : /*
                               2046                 :                :  * Limit the number of pins a batch operation may additionally acquire, to
                               2047                 :                :  * avoid running out of pinnable buffers.
                               2048                 :                :  *
                               2049                 :                :  * One additional pin is always allowed, as otherwise the operation likely
                               2050                 :                :  * cannot be performed at all.
                               2051                 :                :  *
                               2052                 :                :  * The number of allowed pins for a backend is computed based on
                               2053                 :                :  * shared_buffers and the maximum number of connections possible. That's very
                               2054                 :                :  * pessimistic, but outside of toy-sized shared_buffers it should allow
                               2055                 :                :  * sufficient pins.
                               2056                 :                :  */
                               2057                 :                : void
                               2058                 :         495486 : LimitAdditionalPins(uint32 *additional_pins)
                               2059                 :                : {
                               2060                 :                :     uint32      max_backends;
                               2061                 :                :     int         max_proportional_pins;
                               2062                 :                : 
                               2063         [ +  + ]:         495486 :     if (*additional_pins <= 1)
                               2064                 :         177717 :         return;
                               2065                 :                : 
                               2066                 :         317769 :     max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
                               2067                 :         317769 :     max_proportional_pins = NBuffers / max_backends;
                               2068                 :                : 
                               2069                 :                :     /*
                               2070                 :                :      * Subtract the approximate number of buffers already pinned by this
                               2071                 :                :      * backend. We get the number of "overflowed" pins for free, but don't
                               2072                 :                :      * know the number of pins in PrivateRefCountArray. The cost of
                               2073                 :                :      * calculating that exactly doesn't seem worth it, so just assume the max.
                               2074                 :                :      */
                               2075                 :         317769 :     max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
                               2076                 :                : 
  265                          2077         [ +  + ]:         317769 :     if (max_proportional_pins <= 0)
  375                          2078                 :          72504 :         max_proportional_pins = 1;
                               2079                 :                : 
                               2080         [ +  + ]:         317769 :     if (*additional_pins > max_proportional_pins)
                               2081                 :          73214 :         *additional_pins = max_proportional_pins;
                               2082                 :                : }
                               2083                 :                : 
                               2084                 :                : /*
                               2085                 :                :  * Logic shared between ExtendBufferedRelBy() and ExtendBufferedRelTo(). Just to
                               2086                 :                :  * avoid duplicating the tracing and relpersistence related logic.
                               2087                 :                :  */
                               2088                 :                : static BlockNumber
  235 tmunro@postgresql.or     2089                 :         195797 : ExtendBufferedRelCommon(BufferManagerRelation bmr,
                               2090                 :                :                         ForkNumber fork,
                               2091                 :                :                         BufferAccessStrategy strategy,
                               2092                 :                :                         uint32 flags,
                               2093                 :                :                         uint32 extend_by,   /* in: number of blocks requested */
                               2094                 :                :                         BlockNumber extend_upto,
                               2095                 :                :                         Buffer *buffers,    /* passed through to the callee */
                               2096                 :                :                         uint32 *extended_by)    /* out: set from extend_by after the callee returns */
                               2097                 :                : {
                               2098                 :                :     BlockNumber first_block;
                               2099                 :                : 
                               2100                 :                :     TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
                               2101                 :                :                                          bmr.smgr->smgr_rlocator.locator.spcOid,
                               2102                 :                :                                          bmr.smgr->smgr_rlocator.locator.dbOid,
                               2103                 :                :                                          bmr.smgr->smgr_rlocator.locator.relNumber,
                               2104                 :                :                                          bmr.smgr->smgr_rlocator.backend,
                               2105                 :                :                                          extend_by);
                               2106                 :                : 
                               2107         [ +  + ]:         195797 :     if (bmr.relpersistence == RELPERSISTENCE_TEMP)  /* temp relations go through ExtendBufferedRelLocal() */
                               2108                 :           9044 :         first_block = ExtendBufferedRelLocal(bmr, fork, flags,
                               2109                 :                :                                              extend_by, extend_upto,
                               2110                 :                :                                              buffers, &extend_by);
                               2111                 :                :     else
                               2112                 :         186753 :         first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
                               2113                 :                :                                               extend_by, extend_upto,
                               2114                 :                :                                               buffers, &extend_by);
  375 andres@anarazel.de       2115                 :         195797 :     *extended_by = extend_by;   /* the callee was handed &extend_by, so this is its reported count */
                               2116                 :                : 
                               2117                 :                :     TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
                               2118                 :                :                                         bmr.smgr->smgr_rlocator.locator.spcOid,
                               2119                 :                :                                         bmr.smgr->smgr_rlocator.locator.dbOid,
                               2120                 :                :                                         bmr.smgr->smgr_rlocator.locator.relNumber,
                               2121                 :                :                                         bmr.smgr->smgr_rlocator.backend,
                               2122                 :                :                                         *extended_by,
                               2123                 :                :                                         first_block);
                               2124                 :                : 
                               2125                 :         195797 :     return first_block;
                               2126                 :                : }
                               2127                 :                : 
                               2128                 :                : /*
                               2129                 :                :  * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
                               2130                 :                :  * shared buffers.  Returns the relation size found before extending (i.e. the first new block); *extended_by receives the number of blocks added.
                               2131                 :                :  */
                               2132                 :                : static BlockNumber
  235 tmunro@postgresql.or     2133                 :         186753 : ExtendBufferedRelShared(BufferManagerRelation bmr,
                               2134                 :                :                         ForkNumber fork,
                               2135                 :                :                         BufferAccessStrategy strategy,
                               2136                 :                :                         uint32 flags,
                               2137                 :                :                         uint32 extend_by,
                               2138                 :                :                         BlockNumber extend_upto,
                               2139                 :                :                         Buffer *buffers,
                               2140                 :                :                         uint32 *extended_by)
                               2141                 :                : {
                               2142                 :                :     BlockNumber first_block;
  375 andres@anarazel.de       2143                 :         186753 :     IOContext   io_context = IOContextForStrategy(strategy);
                               2144                 :                :     instr_time  io_start;
                               2145                 :                : 
                               2146                 :         186753 :     LimitAdditionalPins(&extend_by);    /* passed by reference; presumably may lower extend_by -- TODO confirm */
                               2147                 :                : 
                               2148                 :                :     /*
                               2149                 :                :      * Acquire victim buffers for extension without holding extension lock.
                               2150                 :                :      * Writing out victim buffers is the most expensive part of extending the
                               2151                 :                :      * relation, particularly when doing so requires WAL flushes. Zeroing out
                               2152                 :                :      * the buffers is also quite expensive, so do that before holding the
                               2153                 :                :      * extension lock as well.
                               2154                 :                :      *
                               2155                 :                :      * These pages are pinned by us and not valid. While we hold the pin they
                               2156                 :                :      * can't be acquired as victim buffers by another backend.
                               2157                 :                :      */
                               2158         [ +  + ]:         393075 :     for (uint32 i = 0; i < extend_by; i++)
                               2159                 :                :     {
                               2160                 :                :         Block       buf_block;
                               2161                 :                : 
                               2162                 :         206322 :         buffers[i] = GetVictimBuffer(strategy, io_context);
                               2163                 :         206322 :         buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
                               2164                 :                : 
                               2165                 :                :         /* new buffers are zero-filled */
                               2166   [ +  -  +  -  :         206322 :         MemSet((char *) buf_block, 0, BLCKSZ);
                                      +  -  -  +  -  
                                                  - ]
                               2167                 :                :     }
                               2168                 :                : 
                               2169                 :                :     /*
                               2170                 :                :      * Lock relation against concurrent extensions, unless requested not to.
                               2171                 :                :      *
                               2172                 :                :      * We use the same extension lock for all forks. That's unnecessarily
                               2173                 :                :      * restrictive, but currently extensions for forks don't happen often
                               2174                 :                :      * enough to make it worth locking more granularly.
                               2175                 :                :      *
                               2176                 :                :      * Note that another backend might have extended the relation by the time
                               2177                 :                :      * we get the lock.
                               2178                 :                :      */
                               2179         [ +  + ]:         186753 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
  235 tmunro@postgresql.or     2180                 :         131904 :         LockRelationForExtension(bmr.rel, ExclusiveLock);
                               2181                 :                : 
                               2182                 :                :     /*
                               2183                 :                :      * If requested, invalidate size cache, so that smgrnblocks asks the
                               2184                 :                :      * kernel.
                               2185                 :                :      */
  375 andres@anarazel.de       2186         [ +  + ]:         186753 :     if (flags & EB_CLEAR_SIZE_CACHE)
  235 tmunro@postgresql.or     2187                 :           6383 :         bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
                               2188                 :                : 
                               2189                 :         186753 :     first_block = smgrnblocks(bmr.smgr, fork);
                               2190                 :                : 
                               2191                 :                :     /*
                               2192                 :                :      * Now that we have the accurate relation size, check if the caller wants
                               2193                 :                :      * us to extend to only up to a specific size. If there were concurrent
                               2194                 :                :      * extensions, we might have acquired too many buffers and need to release
                               2195                 :                :      * them.
                               2196                 :                :      */
  375 andres@anarazel.de       2197         [ +  + ]:         186753 :     if (extend_upto != InvalidBlockNumber)
                               2198                 :                :     {
                               2199                 :          55475 :         uint32      orig_extend_by = extend_by;
                               2200                 :                : 
                               2201         [ -  + ]:          55475 :         if (first_block > extend_upto)
  375 andres@anarazel.de       2202                 :UBC           0 :             extend_by = 0;
  375 andres@anarazel.de       2203         [ +  + ]:CBC       55475 :         else if ((uint64) first_block + extend_by > extend_upto)
                               2204                 :             13 :             extend_by = extend_upto - first_block;
                               2205                 :                : 
                               2206         [ +  + ]:          55506 :         for (uint32 i = extend_by; i < orig_extend_by; i++)
                               2207                 :                :         {
                               2208                 :             31 :             BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
                               2209                 :                : 
                               2210                 :                :             /*
                               2211                 :                :              * The victim buffer we acquired previously is clean and unused;
                               2212                 :                :              * let it be found again quickly.
                               2213                 :                :              */
                               2214                 :             31 :             StrategyFreeBuffer(buf_hdr);
                               2215                 :             31 :             UnpinBuffer(buf_hdr);
                               2216                 :                :         }
                               2217                 :                : 
                               2218         [ +  + ]:          55475 :         if (extend_by == 0)
                               2219                 :                :         {
                               2220         [ +  - ]:             13 :             if (!(flags & EB_SKIP_EXTENSION_LOCK))
  235 tmunro@postgresql.or     2221                 :             13 :                 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
  375 andres@anarazel.de       2222                 :             13 :             *extended_by = extend_by;
                               2223                 :             13 :             return first_block;
                               2224                 :                :         }
                               2225                 :                :     }
                               2226                 :                : 
                               2227                 :                :     /* Fail if the extension would reach or exceed MaxBlockNumber */
                               2228         [ -  + ]:         186740 :     if ((uint64) first_block + extend_by >= MaxBlockNumber)
  375 andres@anarazel.de       2229         [ #  # ]:UBC           0 :         ereport(ERROR,
                               2230                 :                :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                               2231                 :                :                  errmsg("cannot extend relation %s beyond %u blocks",
                               2232                 :                :                         relpath(bmr.smgr->smgr_rlocator, fork),
                               2233                 :                :                         MaxBlockNumber)));
                               2234                 :                : 
                               2235                 :                :     /*
                               2236                 :                :      * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
                               2237                 :                :      *
                               2238                 :                :      * This needs to happen before we extend the relation, because as soon as
                               2239                 :                :      * we do, other backends can start to read in those pages.
                               2240                 :                :      */
  208 peter@eisentraut.org     2241         [ +  + ]:GNC      393031 :     for (uint32 i = 0; i < extend_by; i++)
                               2242                 :                :     {
  375 andres@anarazel.de       2243                 :CBC      206291 :         Buffer      victim_buf = buffers[i];
                               2244                 :         206291 :         BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
                               2245                 :                :         BufferTag   tag;
                               2246                 :                :         uint32      hash;
                               2247                 :                :         LWLock     *partition_lock;
                               2248                 :                :         int         existing_id;
                               2249                 :                : 
                               2250                 :                :         /* in case we need to pin an existing buffer below */
  158 heikki.linnakangas@i     2251                 :GNC      206291 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               2252                 :         206291 :         ReservePrivateRefCountEntry();
                               2253                 :                : 
  235 tmunro@postgresql.or     2254                 :CBC      206291 :         InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
  375 andres@anarazel.de       2255                 :         206291 :         hash = BufTableHashCode(&tag);
                               2256                 :         206291 :         partition_lock = BufMappingPartitionLock(hash);
                               2257                 :                : 
                               2258                 :         206291 :         LWLockAcquire(partition_lock, LW_EXCLUSIVE);
                               2259                 :                : 
                               2260                 :         206291 :         existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
                               2261                 :                : 
                               2262                 :                :         /*
                               2263                 :                :          * We take the branch below only in the corner case where we are trying to extend
                               2264                 :                :          * the relation but we found a pre-existing buffer. This can happen
                               2265                 :                :          * because a prior attempt at extending the relation failed, and
                               2266                 :                :          * because mdread doesn't complain about reads beyond EOF (when
                               2267                 :                :          * zero_damaged_pages is ON) and so a previous attempt to read a block
                               2268                 :                :          * beyond EOF could have left a "valid" zero-filled buffer.
                               2269                 :                :          * Unfortunately, we have also seen this case occurring because of
                               2270                 :                :          * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
                               2271                 :                :          * that doesn't account for a recent write. In that situation, the
                               2272                 :                :          * pre-existing buffer would contain valid data that we don't want to
                               2273                 :                :          * overwrite.  Since the legitimate cases should always have left a
                               2274                 :                :          * zero-filled buffer, complain if not PageIsNew.
                               2275                 :                :          */
                               2276         [ -  + ]:         206291 :         if (existing_id >= 0)
                               2277                 :                :         {
  375 andres@anarazel.de       2278                 :UBC           0 :             BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
                               2279                 :                :             Block       buf_block;
                               2280                 :                :             bool        valid;
                               2281                 :                : 
                               2282                 :                :             /*
                               2283                 :                :              * Pin the existing buffer before releasing the partition lock,
                               2284                 :                :              * preventing it from being evicted.
                               2285                 :                :              */
                               2286                 :              0 :             valid = PinBuffer(existing_hdr, strategy);
                               2287                 :                : 
                               2288                 :              0 :             LWLockRelease(partition_lock);
                               2289                 :                : 
                               2290                 :                :             /*
                               2291                 :                :              * The victim buffer we acquired previously is clean and unused;
                               2292                 :                :              * let it be found again quickly.
                               2293                 :                :              */
                               2294                 :              0 :             StrategyFreeBuffer(victim_buf_hdr);
                               2295                 :              0 :             UnpinBuffer(victim_buf_hdr);
                               2296                 :                : 
                               2297                 :              0 :             buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
                               2298                 :              0 :             buf_block = BufHdrGetBlock(existing_hdr);
                               2299                 :                : 
                               2300   [ #  #  #  # ]:              0 :             if (valid && !PageIsNew((Page) buf_block))
                               2301         [ #  # ]:              0 :                 ereport(ERROR,
                               2302                 :                :                         (errmsg("unexpected data beyond EOF in block %u of relation %s",
                               2303                 :                :                                 existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
                               2304                 :                :                          errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
                               2305                 :                : 
                               2306                 :                :             /*
                               2307                 :                :              * We *must* do smgr[zero]extend before succeeding, else the page
                               2308                 :                :              * will not be reserved by the kernel, and the next P_NEW call
                               2309                 :                :              * will decide to return the same page.  Clear the BM_VALID bit,
                               2310                 :                :              * do StartBufferIO() and proceed.
                               2311                 :                :              *
                               2312                 :                :              * Loop to handle the very small possibility that someone re-sets
                               2313                 :                :              * BM_VALID between our clearing it and StartBufferIO inspecting
                               2314                 :                :              * it.
                               2315                 :                :              */
                               2316                 :                :             do
                               2317                 :                :             {
                               2318                 :              0 :                 uint32      buf_state = LockBufHdr(existing_hdr);
                               2319                 :                : 
                               2320                 :              0 :                 buf_state &= ~BM_VALID;
                               2321                 :              0 :                 UnlockBufHdr(existing_hdr, buf_state);
   11 tmunro@postgresql.or     2322         [ #  # ]:UNC           0 :             } while (!StartBufferIO(existing_hdr, true, false));
                               2323                 :                :         }
                               2324                 :                :         else
                               2325                 :                :         {
                               2326                 :                :             uint32      buf_state;
                               2327                 :                : 
  375 andres@anarazel.de       2328                 :CBC      206291 :             buf_state = LockBufHdr(victim_buf_hdr);
                               2329                 :                : 
                               2330                 :                :             /* some sanity checks while we hold the buffer header lock */
                               2331         [ -  + ]:         206291 :             Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
                               2332         [ -  + ]:         206291 :             Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
                               2333                 :                : 
                               2334                 :         206291 :             victim_buf_hdr->tag = tag;
                               2335                 :                : 
                               2336                 :         206291 :             buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
  235 tmunro@postgresql.or     2337   [ +  +  +  + ]:         206291 :             if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
  375 andres@anarazel.de       2338                 :         202198 :                 buf_state |= BM_PERMANENT;
                               2339                 :                : 
                               2340                 :         206291 :             UnlockBufHdr(victim_buf_hdr, buf_state);
                               2341                 :                : 
                               2342                 :         206291 :             LWLockRelease(partition_lock);
                               2343                 :                : 
                               2344                 :                :             /* XXX: could combine the locked operations in StartBufferIO() with the above */
   11 tmunro@postgresql.or     2345                 :GNC      206291 :             StartBufferIO(victim_buf_hdr, true, false);
                               2346                 :                :         }
                               2347                 :                :     }
                               2348                 :                : 
  120 michael@paquier.xyz      2349                 :         186740 :     io_start = pgstat_prepare_io_time(track_io_timing);
                               2350                 :                : 
                               2351                 :                :     /*
                               2352                 :                :      * Note: if smgrzeroextend fails, we will end up with buffers that are
                               2353                 :                :      * allocated but not marked BM_VALID.  The next relation extension will
                               2354                 :                :      * still select the same block number (because the relation didn't get any
                               2355                 :                :      * longer on disk) and so future attempts to extend the relation will find
                               2356                 :                :      * the same buffers (if they have not been recycled) but come right back
                               2357                 :                :      * here to try smgrzeroextend again.
                               2358                 :                :      *
                               2359                 :                :      * We don't need to set checksum for all-zero pages.
                               2360                 :                :      */
  235 tmunro@postgresql.or     2361                 :CBC      186740 :     smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
                               2362                 :                : 
                               2363                 :                :     /*
                               2364                 :                :      * Release the file-extension lock; it's now OK for someone else to extend
                               2365                 :                :      * the relation some more.
                               2366                 :                :      *
                               2367                 :                :      * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
                               2368                 :                :      * take noticeable time.
                               2369                 :                :      */
  375 andres@anarazel.de       2370         [ +  + ]:         186740 :     if (!(flags & EB_SKIP_EXTENSION_LOCK))
  235 tmunro@postgresql.or     2371                 :         131891 :         UnlockRelationForExtension(bmr.rel, ExclusiveLock);
                               2372                 :                : 
  373 andres@anarazel.de       2373                 :         186740 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
                               2374                 :                :                             io_start, extend_by);
                               2375                 :                : 
                               2376                 :                :     /* Optionally take the content lock, then set BM_VALID, terminate IO, and wake up any waiters */
  208 peter@eisentraut.org     2377         [ +  + ]:GNC      393031 :     for (uint32 i = 0; i < extend_by; i++)
                               2378                 :                :     {
  375 andres@anarazel.de       2379                 :CBC      206291 :         Buffer      buf = buffers[i];
                               2380                 :         206291 :         BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
                               2381                 :         206291 :         bool        lock = false;
                               2382                 :                : 
                               2383   [ +  +  +  + ]:         206291 :         if (flags & EB_LOCK_FIRST && i == 0)
                               2384                 :         131038 :             lock = true;
                               2385         [ +  + ]:          75253 :         else if (flags & EB_LOCK_TARGET)
                               2386                 :                :         {
                               2387         [ -  + ]:          48059 :             Assert(extend_upto != InvalidBlockNumber);
                               2388         [ +  + ]:          48059 :             if (first_block + i + 1 == extend_upto)  /* this buffer holds the last block before extend_upto */
                               2389                 :          47502 :                 lock = true;
                               2390                 :                :         }
                               2391                 :                : 
                               2392         [ +  + ]:         206291 :         if (lock)
                               2393                 :         178540 :             LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
                               2394                 :                : 
  158 heikki.linnakangas@i     2395                 :GNC      206291 :         TerminateBufferIO(buf_hdr, false, BM_VALID, true);
                               2396                 :                :     }
                               2397                 :                : 
  375 andres@anarazel.de       2398                 :CBC      186740 :     pgBufferUsage.shared_blks_written += extend_by;  /* each extended block is counted as a written shared block */
                               2399                 :                : 
                               2400                 :         186740 :     *extended_by = extend_by;
                               2401                 :                : 
                               2402                 :         186740 :     return first_block;
                               2403                 :                : }
                               2404                 :                : 
                               2405                 :                : /*
                               2406                 :                :  * BufferIsExclusiveLocked
                               2407                 :                :  *
                               2408                 :                :  *      Checks if buffer is exclusive-locked.
                               2409                 :                :  *
                               2410                 :                :  * Buffer must be pinned.
                               2411                 :                :  */
                               2412                 :                : bool
  174 jdavis@postgresql.or     2413                 :GNC    13823361 : BufferIsExclusiveLocked(Buffer buffer)
                               2414                 :                : {
                               2415                 :                :     BufferDesc *bufHdr;
                               2416                 :                : 
                               2417         [ -  + ]:       13823361 :     if (BufferIsLocal(buffer))
                               2418                 :                :     {
  174 jdavis@postgresql.or     2419                 :UNC           0 :         int         bufid = -buffer - 1;
                               2420                 :                : 
                               2421                 :              0 :         bufHdr = GetLocalBufferDescriptor(bufid);
                               2422                 :                :     }
                               2423                 :                :     else
                               2424                 :                :     {
  174 jdavis@postgresql.or     2425                 :GNC    13823361 :         bufHdr = GetBufferDescriptor(buffer - 1);
                               2426                 :                :     }
                               2427                 :                : 
                               2428   [ -  +  -  +  :       13823361 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               2429                 :       13823361 :     return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
                               2430                 :                :                                 LW_EXCLUSIVE);
                               2431                 :                : }
                               2432                 :                : 
                               2433                 :                : /*
                               2434                 :                :  * BufferIsDirty
                               2435                 :                :  *
                               2436                 :                :  *      Checks if buffer is already dirty.
                               2437                 :                :  *
                               2438                 :                :  * Buffer must be pinned and exclusive-locked.  (Without an exclusive lock,
                               2439                 :                :  * the result may be stale before it's returned.)
                               2440                 :                :  */
                               2441                 :                : bool
                               2442                 :       13823361 : BufferIsDirty(Buffer buffer)
                               2443                 :                : {
                               2444                 :                :     BufferDesc *bufHdr;
                               2445                 :                : 
                               2446         [ -  + ]:       13823361 :     if (BufferIsLocal(buffer))
                               2447                 :                :     {
  174 jdavis@postgresql.or     2448                 :UNC           0 :         int         bufid = -buffer - 1;
                               2449                 :                : 
                               2450                 :              0 :         bufHdr = GetLocalBufferDescriptor(bufid);
                               2451                 :                :     }
                               2452                 :                :     else
                               2453                 :                :     {
  174 jdavis@postgresql.or     2454                 :GNC    13823361 :         bufHdr = GetBufferDescriptor(buffer - 1);
                               2455                 :                :     }
                               2456                 :                : 
                               2457   [ -  +  -  +  :       13823361 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               2458         [ -  + ]:       13823361 :     Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
                               2459                 :                :                                 LW_EXCLUSIVE));
                               2460                 :                : 
                               2461                 :       13823361 :     return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
                               2462                 :                : }
                               2463                 :                : 
                               2464                 :                : /*
                               2465                 :                :  * MarkBufferDirty
                               2466                 :                :  *
                               2467                 :                :  *      Marks buffer contents as dirty (actual write happens later).
                               2468                 :                :  *
                               2469                 :                :  * Buffer must be pinned and exclusive-locked.  (If caller does not hold
                               2470                 :                :  * exclusive lock, then somebody could be in process of writing the buffer,
                               2471                 :                :  * leading to risk of bad data written to disk.)
                               2472                 :                :  */
                               2473                 :                : void
 6589 tgl@sss.pgh.pa.us        2474                 :CBC    20462434 : MarkBufferDirty(Buffer buffer)
                               2475                 :                : {
                               2476                 :                :     BufferDesc *bufHdr;
                               2477                 :                :     uint32      buf_state;
                               2478                 :                :     uint32      old_buf_state;
                               2479                 :                : 
 7121                          2480         [ -  + ]:       20462434 :     if (!BufferIsValid(buffer))
 4683 peter_e@gmx.net          2481         [ #  # ]:UBC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
                               2482                 :                : 
 8780 tgl@sss.pgh.pa.us        2483         [ +  + ]:CBC    20462434 :     if (BufferIsLocal(buffer))
                               2484                 :                :     {
 6589                          2485                 :        1140952 :         MarkLocalBufferDirty(buffer);
 7974 bruce@momjian.us         2486                 :        1140952 :         return;
                               2487                 :                :     }
                               2488                 :                : 
 3363 andres@anarazel.de       2489                 :       19321482 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               2490                 :                : 
 3515                          2491   [ -  +  -  +  :       19321482 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
 2778 simon@2ndQuadrant.co     2492         [ -  + ]:       19321482 :     Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
                               2493                 :                :                                 LW_EXCLUSIVE));
                               2494                 :                : 
 2926 andres@anarazel.de       2495                 :       19321482 :     old_buf_state = pg_atomic_read_u32(&bufHdr->state);
                               2496                 :                :     for (;;)
                               2497                 :                :     {
                               2498         [ +  + ]:       19321887 :         if (old_buf_state & BM_LOCKED)
                               2499                 :            108 :             old_buf_state = WaitBufHdrUnlocked(bufHdr);
                               2500                 :                : 
                               2501                 :       19321887 :         buf_state = old_buf_state;
                               2502                 :                : 
                               2503         [ -  + ]:       19321887 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               2504                 :       19321887 :         buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
                               2505                 :                : 
                               2506         [ +  + ]:       19321887 :         if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
                               2507                 :                :                                            buf_state))
                               2508                 :       19321482 :             break;
                               2509                 :                :     }
                               2510                 :                : 
                               2511                 :                :     /*
                               2512                 :                :      * If the buffer was not dirty already, do vacuum and buffer-usage accounting.
                               2513                 :                :      */
                               2514         [ +  + ]:       19321482 :     if (!(old_buf_state & BM_DIRTY))
                               2515                 :                :     {
 4524 alvherre@alvh.no-ip.     2516                 :         594990 :         VacuumPageDirty++;
 4435 rhaas@postgresql.org     2517                 :         594990 :         pgBufferUsage.shared_blks_dirtied++;
 4524 alvherre@alvh.no-ip.     2518         [ +  + ]:         594990 :         if (VacuumCostActive)
                               2519                 :           6892 :             VacuumCostBalance += VacuumCostPageDirty;
                               2520                 :                :     }
                               2521                 :                : }
                               2522                 :                : 
                               2523                 :                : /*
                               2524                 :                :  * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
                               2525                 :                :  *
                               2526                 :                :  * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
                               2527                 :                :  * compared to calling the two routines separately.  Now it's mainly just
                               2528                 :                :  * a convenience function.  However, if the passed buffer is valid and
                               2529                 :                :  * already contains the desired block, we just return it as-is; and that
                               2530                 :                :  * does save considerable work compared to a full release and reacquire.
                               2531                 :                :  *
                               2532                 :                :  * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
                               2533                 :                :  * buffer actually needs to be released.  This case is the same as ReadBuffer,
                               2534                 :                :  * but can save some tests in the caller.
                               2535                 :                :  */
                               2536                 :                : Buffer
10141 scrappy@hub.org          2537                 :       24803481 : ReleaseAndReadBuffer(Buffer buffer,
                               2538                 :                :                      Relation relation,
                               2539                 :                :                      BlockNumber blockNum)
                               2540                 :                : {
 5421 bruce@momjian.us         2541                 :       24803481 :     ForkNumber  forkNum = MAIN_FORKNUM;
                               2542                 :                :     BufferDesc *bufHdr;
                               2543                 :                : 
 8373 tgl@sss.pgh.pa.us        2544         [ +  + ]:       24803481 :     if (BufferIsValid(buffer))
                               2545                 :                :     {
 3515 andres@anarazel.de       2546   [ -  +  +  +  :       14600072 :         Assert(BufferIsPinned(buffer));
                                              -  + ]
 8373 tgl@sss.pgh.pa.us        2547         [ +  + ]:       14600072 :         if (BufferIsLocal(buffer))
                               2548                 :                :         {
 3363 andres@anarazel.de       2549                 :         105224 :             bufHdr = GetLocalBufferDescriptor(-buffer - 1);
 8345 tgl@sss.pgh.pa.us        2550   [ +  +  +  - ]:         107234 :             if (bufHdr->tag.blockNum == blockNum &&
  599 rhaas@postgresql.org     2551         [ +  - ]:           4020 :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
                               2552                 :           2010 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
 8345 tgl@sss.pgh.pa.us        2553                 :           2010 :                 return buffer;
  375 andres@anarazel.de       2554                 :         103214 :             UnpinLocalBuffer(buffer);
                               2555                 :                :         }
                               2556                 :                :         else
                               2557                 :                :         {
 3363                          2558                 :       14494848 :             bufHdr = GetBufferDescriptor(buffer - 1);
                               2559                 :                :             /* we have pin, so it's ok to examine tag without spinlock */
 8345 tgl@sss.pgh.pa.us        2560   [ +  +  +  - ]:       19365605 :             if (bufHdr->tag.blockNum == blockNum &&
  599 rhaas@postgresql.org     2561         [ +  - ]:        9741514 :                 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
                               2562                 :        4870757 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum)
 8345 tgl@sss.pgh.pa.us        2563                 :        4870757 :                 return buffer;
  562 michael@paquier.xyz      2564                 :        9624091 :             UnpinBuffer(bufHdr);
                               2565                 :                :         }
                               2566                 :                :     }
                               2567                 :                : 
 6981 tgl@sss.pgh.pa.us        2568                 :       19930714 :     return ReadBuffer(relation, blockNum);
                               2569                 :                : }
                               2570                 :                : 
                               2571                 :                : /*
                               2572                 :                :  * PinBuffer -- make buffer unavailable for replacement.
                               2573                 :                :  *
                               2574                 :                :  * For the default access strategy, the buffer's usage_count is incremented
                               2575                 :                :  * when we first pin it; for other strategies we just make sure the usage_count
                               2576                 :                :  * isn't zero.  (The idea of the latter is that we don't want synchronized
                               2577                 :                :  * heap scans to inflate the count, but we need it to not be zero to discourage
                               2578                 :                :  * other backends from stealing buffers from our ring.  As long as we cycle
                               2579                 :                :  * through the ring faster than the global clock-sweep cycles, buffers in
                               2580                 :                :  * our ring won't be chosen as victims for replacement by other backends.)
                               2581                 :                :  *
                               2582                 :                :  * This should be applied only to shared buffers, never local ones.
                               2583                 :                :  *
                               2584                 :                :  * Since buffers are pinned/unpinned very frequently, pin buffers without
                               2585                 :                :  * taking the buffer header lock; instead update the state variable in a loop of
                               2586                 :                :  * CAS operations. Hopefully it's just a single CAS.
                               2587                 :                :  *
                               2588                 :                :  * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry()
                               2589                 :                :  * must have been done already.
                               2590                 :                :  *
                               2591                 :                :  * Returns true if buffer is BM_VALID, else false.  This provision allows
                               2592                 :                :  * some callers to avoid an extra spinlock cycle.
                               2593                 :                :  */
                               2594                 :                : static bool
 3072 rhaas@postgresql.org     2595                 :       49229176 : PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
                               2596                 :                : {
 3168 andres@anarazel.de       2597                 :       49229176 :     Buffer      b = BufferDescriptorGetBuffer(buf);
                               2598                 :                :     bool        result;
                               2599                 :                :     PrivateRefCountEntry *ref;
                               2600                 :                : 
  375                          2601         [ -  + ]:       49229176 :     Assert(!BufferIsLocal(b));
  158 heikki.linnakangas@i     2602         [ -  + ]:GNC    49229176 :     Assert(ReservedRefCountEntry != NULL);
                               2603                 :                : 
 3168 andres@anarazel.de       2604                 :CBC    49229176 :     ref = GetPrivateRefCountEntry(b, true);
                               2605                 :                : 
 3373                          2606         [ +  + ]:       49229176 :     if (ref == NULL)
                               2607                 :                :     {
                               2608                 :                :         uint32      buf_state;
                               2609                 :                :         uint32      old_buf_state;
                               2610                 :                : 
 3168                          2611                 :       47225541 :         ref = NewPrivateRefCountEntry(b);
                               2612                 :                : 
 2926                          2613                 :       47225541 :         old_buf_state = pg_atomic_read_u32(&buf->state);
                               2614                 :                :         for (;;)
                               2615                 :                :         {
                               2616         [ +  + ]:       47240049 :             if (old_buf_state & BM_LOCKED)
                               2617                 :           1074 :                 old_buf_state = WaitBufHdrUnlocked(buf);
                               2618                 :                : 
                               2619                 :       47240049 :             buf_state = old_buf_state;
                               2620                 :                : 
                               2621                 :                :             /* increase refcount */
                               2622                 :       47240049 :             buf_state += BUF_REFCOUNT_ONE;
                               2623                 :                : 
 2582 teodor@sigaev.ru         2624         [ +  + ]:       47240049 :             if (strategy == NULL)
                               2625                 :                :             {
                               2626                 :                :                 /* Default case: increase usagecount unless already max. */
                               2627         [ +  + ]:       46657503 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
                               2628                 :        2649640 :                     buf_state += BUF_USAGECOUNT_ONE;
                               2629                 :                :             }
                               2630                 :                :             else
                               2631                 :                :             {
                               2632                 :                :                 /*
                               2633                 :                :                  * Ring buffers shouldn't evict others from pool.  Thus we
                               2634                 :                :                  * don't make usagecount more than 1.
                               2635                 :                :                  */
                               2636         [ +  + ]:         582546 :                 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
                               2637                 :          22238 :                     buf_state += BUF_USAGECOUNT_ONE;
                               2638                 :                :             }
                               2639                 :                : 
 2926 andres@anarazel.de       2640         [ +  + ]:       47240049 :             if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                               2641                 :                :                                                buf_state))
                               2642                 :                :             {
                               2643                 :       47225541 :                 result = (buf_state & BM_VALID) != 0;
                               2644                 :                : 
                               2645                 :                :                 /*
                               2646                 :                :                  * Assume that we acquired a buffer pin for the purposes of
                               2647                 :                :                  * Valgrind buffer client checks (even in !result case) to
                               2648                 :                :                  * keep things simple.  Buffers that are unsafe to access are
                               2649                 :                :                  * not generally guaranteed to be marked undefined or
                               2650                 :                :                  * non-accessible in any case.
                               2651                 :                :                  */
                               2652                 :                :                 VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
                               2653                 :       47225541 :                 break;
                               2654                 :                :             }
                               2655                 :                :         }
                               2656                 :                :     }
                               2657                 :                :     else
                               2658                 :                :     {
                               2659                 :                :         /*
                               2660                 :                :          * If we previously pinned the buffer, it is likely to be valid, but
                               2661                 :                :          * it may not be if StartReadBuffers() was called and
                               2662                 :                :          * WaitReadBuffers() hasn't been called yet.  We'll check by loading
                               2663                 :                :          * the flags without locking.  This is racy, but it's OK to return
                               2664                 :                :          * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
                               2665                 :                :          * it'll see that it's now valid.
                               2666                 :                :          *
                               2667                 :                :          * Note: We deliberately avoid a Valgrind client request here.
                               2668                 :                :          * Individual access methods can optionally superimpose buffer page
                               2669                 :                :          * client requests on top of our client requests to enforce that
                               2670                 :                :          * buffers are only accessed while locked (and pinned).  It's possible
                               2671                 :                :          * that the buffer page is legitimately non-accessible here.  We
                               2672                 :                :          * cannot meddle with that.
                               2673                 :                :          */
   11 tmunro@postgresql.or     2674                 :GNC     2003635 :         result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
                               2675                 :                :     }
                               2676                 :                : 
 3515 andres@anarazel.de       2677                 :CBC    49229176 :     ref->refcount++;
                               2678         [ -  + ]:       49229176 :     Assert(ref->refcount > 0);
 3168                          2679                 :       49229176 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
 6981 tgl@sss.pgh.pa.us        2680                 :       49229176 :     return result;
                               2681                 :                : }
                               2682                 :                : 
                               2683                 :                : /*
                               2684                 :                :  * PinBuffer_Locked -- as above, but caller already locked the buffer header.
                               2685                 :                :  * The spinlock is released before return.
                               2686                 :                :  *
                               2687                 :                :  * As this function is called with the spinlock held, the caller must have
                               2688                 :                :  * previously called ReservePrivateRefCountEntry() and
                               2689                 :                :  * ResourceOwnerEnlarge(CurrentResourceOwner).
                               2690                 :                :  *
                               2691                 :                :  * Currently, no callers of this function want to modify the buffer's
                               2692                 :                :  * usage_count at all, so there's no need for a strategy parameter.
                               2693                 :                :  * Also we don't bother with a BM_VALID test (the caller could check that for
                               2694                 :                :  * itself).
                               2695                 :                :  *
                               2696                 :                :  * Also all callers only ever use this function when it's known that the
                               2697                 :                :  * buffer can't have a preexisting pin by this backend. That allows us to skip
                               2698                 :                :  * searching the private refcount array & hash, which is a boon, because the
                               2699                 :                :  * spinlock is still held.
                               2700                 :                :  *
                               2701                 :                :  * Note: use of this routine is frequently mandatory, not just an optimization
                               2702                 :                :  * to save a spin lock/unlock cycle, because we need to pin a buffer before
                               2703                 :                :  * its state can change under us.
                               2704                 :                :  */
                               2705                 :                : static void
 3072 rhaas@postgresql.org     2706                 :        2381770 : PinBuffer_Locked(BufferDesc *buf)
                               2707                 :                : {
                               2708                 :                :     Buffer      b;
                               2709                 :                :     PrivateRefCountEntry *ref;
                               2710                 :                :     uint32      buf_state;
                               2711                 :                : 
                               2712                 :                :     /*
                               2713                 :                :      * As explained, we don't expect any preexisting pins. That allows us to
                               2714                 :                :      * manipulate the PrivateRefCount after releasing the spinlock.
                               2715                 :                :      */
 3168 andres@anarazel.de       2716         [ -  + ]:        2381770 :     Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
                               2717                 :                : 
                               2718                 :                :     /*
                               2719                 :                :      * Buffer can't have a preexisting pin, so mark its page as defined to
                               2720                 :                :      * Valgrind (this is similar to the PinBuffer() case where the backend
                               2721                 :                :      * doesn't already have a buffer pin)
                               2722                 :                :      */
                               2723                 :                :     VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
                               2724                 :                : 
                               2725                 :                :     /*
                               2726                 :                :      * Since we hold the buffer spinlock, we can update the buffer state and
                               2727                 :                :      * release the lock in one operation.
                               2728                 :                :      */
 2926                          2729                 :        2381770 :     buf_state = pg_atomic_read_u32(&buf->state);
                               2730         [ -  + ]:        2381770 :     Assert(buf_state & BM_LOCKED);
                               2731                 :        2381770 :     buf_state += BUF_REFCOUNT_ONE;
                               2732                 :        2381770 :     UnlockBufHdr(buf, buf_state);
                               2733                 :                : 
 3168                          2734                 :        2381770 :     b = BufferDescriptorGetBuffer(buf);
                               2735                 :                : 
                               2736                 :        2381770 :     ref = NewPrivateRefCountEntry(b);
 3515                          2737                 :        2381770 :     ref->refcount++;
                               2738                 :                : 
 3168                          2739                 :        2381770 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
 7300 tgl@sss.pgh.pa.us        2740                 :        2381770 : }
                               2741                 :                : 
                               2742                 :                : /*
                               2743                 :                :  * UnpinBuffer -- make buffer available for replacement.
                               2744                 :                :  *
                               2745                 :                :  * This should be applied only to shared buffers, never local ones.  This
                               2746                 :                :  * always adjusts CurrentResourceOwner.
                               2747                 :                :  */
                               2748                 :                : static void
  562 michael@paquier.xyz      2749                 :       60818830 : UnpinBuffer(BufferDesc *buf)
                               2750                 :                : {
  158 heikki.linnakangas@i     2751                 :GNC    60818830 :     Buffer      b = BufferDescriptorGetBuffer(buf);
                               2752                 :                : 
                               2753                 :       60818830 :     ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
                               2754                 :       60818830 :     UnpinBufferNoOwner(buf);
                               2755                 :       60818830 : }
                               2756                 :                : 
                               2757                 :                : static void
                               2758                 :       60822581 : UnpinBufferNoOwner(BufferDesc *buf)
                               2759                 :                : {
                               2760                 :                :     PrivateRefCountEntry *ref;
 3168 andres@anarazel.de       2761                 :CBC    60822581 :     Buffer      b = BufferDescriptorGetBuffer(buf);
                               2762                 :                : 
  375                          2763         [ -  + ]:       60822581 :     Assert(!BufferIsLocal(b));
                               2764                 :                : 
                               2765                 :                :     /* not moving as we're likely deleting it soon anyway */
 3168                          2766                 :       60822581 :     ref = GetPrivateRefCountEntry(b, false);
 3515                          2767         [ -  + ]:       60822581 :     Assert(ref != NULL);
                               2768         [ -  + ]:       60822581 :     Assert(ref->refcount > 0);
                               2769                 :       60822581 :     ref->refcount--;
                               2770         [ +  + ]:       60822581 :     if (ref->refcount == 0)
                               2771                 :                :     {
                               2772                 :                :         uint32      buf_state;
                               2773                 :                :         uint32      old_buf_state;
                               2774                 :                : 
                               2775                 :                :         /*
                               2776                 :                :          * Mark buffer non-accessible to Valgrind.
                               2777                 :                :          *
                               2778                 :                :          * Note that the buffer may have already been marked non-accessible
                               2779                 :                :          * within access method code that enforces that buffers are only
                               2780                 :                :          * accessed while a buffer lock is held.
                               2781                 :                :          */
                               2782                 :                :         VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
                               2783                 :                : 
                               2784                 :                :         /* I'd better not still hold the buffer content lock */
 3043 rhaas@postgresql.org     2785         [ -  + ]:       49607311 :         Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
                               2786                 :                : 
                               2787                 :                :         /*
                               2788                 :                :          * Decrement the shared reference count.
                               2789                 :                :          *
                               2790                 :                :          * Since the buffer header spinlock holder can update the state using
                               2791                 :                :          * a plain write, an atomic decrement is not safe here; use a CAS loop.
                               2792                 :                :          */
 2926 andres@anarazel.de       2793                 :       49607311 :         old_buf_state = pg_atomic_read_u32(&buf->state);
                               2794                 :                :         for (;;)
                               2795                 :                :         {
                               2796         [ +  + ]:       49622418 :             if (old_buf_state & BM_LOCKED)
                               2797                 :            952 :                 old_buf_state = WaitBufHdrUnlocked(buf);
                               2798                 :                : 
                               2799                 :       49622418 :             buf_state = old_buf_state;
                               2800                 :                : 
                               2801                 :       49622418 :             buf_state -= BUF_REFCOUNT_ONE;
                               2802                 :                : 
                               2803         [ +  + ]:       49622418 :             if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                               2804                 :                :                                                buf_state))
                               2805                 :       49607311 :                 break;
                               2806                 :                :         }
                               2807                 :                : 
                               2808                 :                :         /* Support LockBufferForCleanup() */
                               2809         [ +  + ]:       49607311 :         if (buf_state & BM_PIN_COUNT_WAITER)
                               2810                 :                :         {
                               2811                 :                :             /*
                               2812                 :                :              * Acquire the buffer header lock, re-check that there's a waiter.
                               2813                 :                :              * Another backend could have unpinned this buffer, and already
                               2814                 :                :              * woken up the waiter.  There's no danger of the buffer being
                               2815                 :                :              * replaced after we unpinned it above, as it's pinned by the
                               2816                 :                :              * waiter.
                               2817                 :                :              */
 2926 andres@anarazel.de       2818                 :GBC           2 :             buf_state = LockBufHdr(buf);
                               2819                 :                : 
                               2820         [ +  - ]:              2 :             if ((buf_state & BM_PIN_COUNT_WAITER) &&
                               2821         [ +  - ]:              2 :                 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
                               2822                 :              2 :             {
                               2823                 :                :                 /* we just released the last pin other than the waiter's */
  850 tmunro@postgresql.or     2824                 :              2 :                 int         wait_backend_pgprocno = buf->wait_backend_pgprocno;
                               2825                 :                : 
 2926 andres@anarazel.de       2826                 :              2 :                 buf_state &= ~BM_PIN_COUNT_WAITER;
                               2827                 :              2 :                 UnlockBufHdr(buf, buf_state);
  850 tmunro@postgresql.or     2828                 :              2 :                 ProcSendSignal(wait_backend_pgprocno);
                               2829                 :                :             }
                               2830                 :                :             else
 2926 andres@anarazel.de       2831                 :UBC           0 :                 UnlockBufHdr(buf, buf_state);
                               2832                 :                :         }
 3515 andres@anarazel.de       2833                 :CBC    49607311 :         ForgetPrivateRefCountEntry(ref);
                               2834                 :                :     }
 7300 tgl@sss.pgh.pa.us        2835                 :       60822581 : }
                               2836                 :                : 
                               2837                 :                : #define ST_SORT sort_checkpoint_bufferids
                               2838                 :                : #define ST_ELEMENT_TYPE CkptSortItem
                               2839                 :                : #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
                               2840                 :                : #define ST_SCOPE static
                               2841                 :                : #define ST_DEFINE
                               2842                 :                : #include <lib/sort_template.h>
                               2843                 :                : 
                               2844                 :                : /*
                               2845                 :                :  * BufferSync -- Write out all dirty buffers in the pool.
                               2846                 :                :  *
                               2847                 :                :  * This is called at checkpoint time to write out all dirty shared buffers.
                               2848                 :                :  * The checkpoint request flags should be passed in.  If CHECKPOINT_IMMEDIATE
                               2849                 :                :  * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
                               2850                 :                :  * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
                               2851                 :                :  * unlogged buffers, which are otherwise skipped.  The remaining flags
                               2852                 :                :  * currently have no effect here.
                               2853                 :                :  */
                               2854                 :                : static void
 6135                          2855                 :           1153 : BufferSync(int flags)
                               2856                 :                : {
                               2857                 :                :     uint32      buf_state;
                               2858                 :                :     int         buf_id;
                               2859                 :                :     int         num_to_scan;
                               2860                 :                :     int         num_spaces;
                               2861                 :                :     int         num_processed;
                               2862                 :                :     int         num_written;
 2977 andres@anarazel.de       2863                 :           1153 :     CkptTsStatus *per_ts_stat = NULL;
                               2864                 :                :     Oid         last_tsid;
                               2865                 :                :     binaryheap *ts_heap;
                               2866                 :                :     int         i;
 4855 rhaas@postgresql.org     2867                 :           1153 :     int         mask = BM_DIRTY;
                               2868                 :                :     WritebackContext wb_context;
                               2869                 :                : 
                               2870                 :                :     /*
                               2871                 :                :      * Unless this is a shutdown or end-of-recovery checkpoint, or we have
                               2872                 :                :      * been explicitly told to flush everything (CHECKPOINT_FLUSH_ALL), we
                               2873                 :                :      * write only permanent, dirty buffers; otherwise all dirty buffers.
                               2874                 :                :      */
 3464 andres@anarazel.de       2875         [ +  + ]:           1153 :     if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
                               2876                 :                :                     CHECKPOINT_FLUSH_ALL))))
 4482 rhaas@postgresql.org     2877                 :            470 :         mask |= BM_PERMANENT;
                               2878                 :                : 
                               2879                 :                :     /*
                               2880                 :                :      * Loop over all buffers, and mark the ones that need to be written with
                               2881                 :                :      * BM_CHECKPOINT_NEEDED.  Count them as we go (num_to_scan), so that we
                               2882                 :                :      * can estimate how much work needs to be done.
                               2883                 :                :      *
                               2884                 :                :      * This allows us to write only those pages that were dirty when the
                               2885                 :                :      * checkpoint began, and not those that get dirtied while it proceeds.
                               2886                 :                :      * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
                               2887                 :                :      * later in this function, or by normal backends or the bgwriter cleaning
                               2888                 :                :      * scan, the flag is cleared.  Any buffer dirtied after this point won't
                               2889                 :                :      * have the flag set.
                               2890                 :                :      *
                               2891                 :                :      * Note that if we fail to write some buffer, we may leave buffers with
                               2892                 :                :      * BM_CHECKPOINT_NEEDED still set.  This is OK since any such buffer would
                               2893                 :                :      * certainly need to be written for the next checkpoint attempt, too.
                               2894                 :                :      */
 2977 andres@anarazel.de       2895                 :           1153 :     num_to_scan = 0;
 6135 tgl@sss.pgh.pa.us        2896         [ +  + ]:       10698353 :     for (buf_id = 0; buf_id < NBuffers; buf_id++)
                               2897                 :                :     {
 3072 rhaas@postgresql.org     2898                 :       10697200 :         BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
                               2899                 :                : 
                               2900                 :                :         /*
                               2901                 :                :          * Header spinlock is enough to examine BM_DIRTY, see comment in
                               2902                 :                :          * SyncOneBuffer.
                               2903                 :                :          */
 2926 andres@anarazel.de       2904                 :       10697200 :         buf_state = LockBufHdr(bufHdr);
                               2905                 :                : 
                               2906         [ +  + ]:       10697200 :         if ((buf_state & mask) == mask)
                               2907                 :                :         {
                               2908                 :                :             CkptSortItem *item;
                               2909                 :                : 
                               2910                 :         241920 :             buf_state |= BM_CHECKPOINT_NEEDED;
                               2911                 :                : 
 2977                          2912                 :         241920 :             item = &CkptBufferIds[num_to_scan++];
                               2913                 :         241920 :             item->buf_id = buf_id;
  599 rhaas@postgresql.org     2914                 :         241920 :             item->tsId = bufHdr->tag.spcOid;
                               2915                 :         241920 :             item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
                               2916                 :         241920 :             item->forkNum = BufTagGetForkNum(&bufHdr->tag);
 2977 andres@anarazel.de       2917                 :         241920 :             item->blockNum = bufHdr->tag.blockNum;
                               2918                 :                :         }
                               2919                 :                : 
 2926                          2920                 :       10697200 :         UnlockBufHdr(bufHdr, buf_state);
                               2921                 :                : 
                               2922                 :                :         /* Check for barrier events in case NBuffers is large. */
 1578 rhaas@postgresql.org     2923         [ -  + ]:       10697200 :         if (ProcSignalBarrierPending)
 1578 rhaas@postgresql.org     2924                 :UBC           0 :             ProcessProcSignalBarrier();
                               2925                 :                :     }
                               2926                 :                : 
 2977 andres@anarazel.de       2927         [ +  + ]:CBC        1153 :     if (num_to_scan == 0)
 6135 tgl@sss.pgh.pa.us        2928                 :            399 :         return;                 /* nothing to do */
                               2929                 :                : 
 2977 andres@anarazel.de       2930                 :            754 :     WritebackContextInit(&wb_context, &checkpoint_flush_after);
                               2931                 :                : 
                               2932                 :                :     TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
                               2933                 :                : 
                               2934                 :                :     /*
                               2935                 :                :      * Sort buffers that need to be written to reduce the likelihood of random
                               2936                 :                :      * IO. The sorting is also important for the implementation of balancing
                               2937                 :                :      * writes between tablespaces. Without balancing writes we'd potentially
                               2938                 :                :      * end up writing to the tablespaces one-by-one; possibly overloading the
                               2939                 :                :      * underlying system.
                               2940                 :                :      */
 1129 tmunro@postgresql.or     2941                 :            754 :     sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
                               2942                 :                : 
 2977 andres@anarazel.de       2943                 :            754 :     num_spaces = 0;
                               2944                 :                : 
                               2945                 :                :     /*
                               2946                 :                :      * Allocate progress status for each tablespace with buffers that need to
                               2947                 :                :      * be flushed. This requires the to-be-flushed array to be sorted.
                               2948                 :                :      */
                               2949                 :            754 :     last_tsid = InvalidOid;
                               2950         [ +  + ]:         242674 :     for (i = 0; i < num_to_scan; i++)
                               2951                 :                :     {
                               2952                 :                :         CkptTsStatus *s;
                               2953                 :                :         Oid         cur_tsid;
                               2954                 :                : 
                               2955                 :         241920 :         cur_tsid = CkptBufferIds[i].tsId;
                               2956                 :                : 
                               2957                 :                :         /*
                               2958                 :                :          * Grow array of per-tablespace status structs, every time a new
                               2959                 :                :          * tablespace is found.
                               2960                 :                :          */
                               2961   [ +  +  +  + ]:         241920 :         if (last_tsid == InvalidOid || last_tsid != cur_tsid)
                               2962                 :           1129 :         {
                               2963                 :                :             Size        sz;
                               2964                 :                : 
                               2965                 :           1129 :             num_spaces++;
                               2966                 :                : 
                               2967                 :                :             /*
                               2968                 :                :              * Not worth adding grow-by-power-of-2 logic here - even with a
                               2969                 :                :              * few hundred tablespaces this should be fine.
                               2970                 :                :              */
                               2971                 :           1129 :             sz = sizeof(CkptTsStatus) * num_spaces;
                               2972                 :                : 
                               2973         [ +  + ]:           1129 :             if (per_ts_stat == NULL)
                               2974                 :            754 :                 per_ts_stat = (CkptTsStatus *) palloc(sz);
                               2975                 :                :             else
                               2976                 :            375 :                 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
                               2977                 :                : 
                               2978                 :           1129 :             s = &per_ts_stat[num_spaces - 1];
                               2979                 :           1129 :             memset(s, 0, sizeof(*s));
                               2980                 :           1129 :             s->tsId = cur_tsid;
                               2981                 :                : 
                               2982                 :                :             /*
                               2983                 :                :              * This is the first buffer in this tablespace. As CkptBufferIds is
                               2984                 :                :              * sorted by tablespace, all (s->num_to_scan) buffers in this
                               2985                 :                :              * tablespace will follow afterwards.
                               2986                 :                :              */
                               2987                 :           1129 :             s->index = i;
                               2988                 :                : 
                               2989                 :                :             /*
                               2990                 :                :              * progress_slice will be determined once we know how many buffers
                               2991                 :                :              * are in each tablespace, i.e. after this loop.
                               2992                 :                :              */
                               2993                 :                : 
                               2994                 :           1129 :             last_tsid = cur_tsid;
                               2995                 :                :         }
                               2996                 :                :         else
                               2997                 :                :         {
                               2998                 :         240791 :             s = &per_ts_stat[num_spaces - 1];
                               2999                 :                :         }
                               3000                 :                : 
                               3001                 :         241920 :         s->num_to_scan++;
                               3002                 :                : 
                               3003                 :                :         /* Check for barrier events. */
 1578 rhaas@postgresql.org     3004         [ -  + ]:         241920 :         if (ProcSignalBarrierPending)
 1578 rhaas@postgresql.org     3005                 :UBC           0 :             ProcessProcSignalBarrier();
                               3006                 :                :     }
                               3007                 :                : 
 2977 andres@anarazel.de       3008         [ -  + ]:CBC         754 :     Assert(num_spaces > 0);
                               3009                 :                : 
                               3010                 :                :     /*
                               3011                 :                :      * Build a min-heap over the write-progress in the individual tablespaces,
                               3012                 :                :      * and compute how large a portion of the total progress a single
                               3013                 :                :      * processed buffer is.
                               3014                 :                :      */
                               3015                 :            754 :     ts_heap = binaryheap_allocate(num_spaces,
                               3016                 :                :                                   ts_ckpt_progress_comparator,
                               3017                 :                :                                   NULL);
                               3018                 :                : 
                               3019         [ +  + ]:           1883 :     for (i = 0; i < num_spaces; i++)
                               3020                 :                :     {
                               3021                 :           1129 :         CkptTsStatus *ts_stat = &per_ts_stat[i];
                               3022                 :                : 
                               3023                 :           1129 :         ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
                               3024                 :                : 
                               3025                 :           1129 :         binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
                               3026                 :                :     }
                               3027                 :                : 
                               3028                 :            754 :     binaryheap_build(ts_heap);
                               3029                 :                : 
                               3030                 :                :     /*
                               3031                 :                :      * Iterate through to-be-checkpointed buffers and write the ones (still)
                               3032                 :                :      * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
                               3033                 :                :      * tablespaces; otherwise the sorting would lead to only one tablespace
                               3034                 :                :      * receiving writes at a time, making inefficient use of the hardware.
                               3035                 :                :      */
                               3036                 :            754 :     num_processed = 0;
 6135 tgl@sss.pgh.pa.us        3037                 :            754 :     num_written = 0;
 2977 andres@anarazel.de       3038         [ +  + ]:         242556 :     while (!binaryheap_empty(ts_heap))
                               3039                 :                :     {
                               3040                 :         241807 :         BufferDesc *bufHdr = NULL;
                               3041                 :                :         CkptTsStatus *ts_stat = (CkptTsStatus *)
  331 tgl@sss.pgh.pa.us        3042                 :         241807 :             DatumGetPointer(binaryheap_first(ts_heap));
                               3043                 :                : 
 2977 andres@anarazel.de       3044                 :         241807 :         buf_id = CkptBufferIds[ts_stat->index].buf_id;
                               3045         [ -  + ]:         241807 :         Assert(buf_id != -1);
                               3046                 :                : 
                               3047                 :         241807 :         bufHdr = GetBufferDescriptor(buf_id);
                               3048                 :                : 
                               3049                 :         241807 :         num_processed++;
                               3050                 :                : 
                               3051                 :                :         /*
                               3052                 :                :          * We don't need to acquire the lock here, because we're only looking
                               3053                 :                :          * at a single bit. It's possible that someone else writes the buffer
                               3054                 :                :          * and clears the flag right after we check, but that doesn't matter
                               3055                 :                :          * since SyncOneBuffer will then do nothing.  However, there is a
                               3056                 :                :          * further race condition: it's conceivable that between the time we
                               3057                 :                :          * examine the bit here and the time SyncOneBuffer acquires the lock,
                               3058                 :                :          * someone else not only wrote the buffer but replaced it with another
                               3059                 :                :          * page and dirtied it.  In that improbable case, SyncOneBuffer will
                               3060                 :                :          * write the buffer though we didn't need to.  It doesn't seem worth
                               3061                 :                :          * guarding against this, though.
                               3062                 :                :          */
 2926                          3063         [ +  + ]:         241807 :         if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
                               3064                 :                :         {
 2977                          3065         [ +  - ]:         239677 :             if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
                               3066                 :                :             {
                               3067                 :                :                 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
  167 michael@paquier.xyz      3068                 :GNC      239677 :                 PendingCheckpointerStats.buffers_written++;
 6135 tgl@sss.pgh.pa.us        3069                 :CBC      239677 :                 num_written++;
                               3070                 :                :             }
                               3071                 :                :         }
                               3072                 :                : 
                               3073                 :                :         /*
                               3074                 :                :          * Measure progress independent of actually having to flush the buffer
                               3075                 :                :          * - otherwise writing becomes unbalanced.
                               3076                 :                :          */
 2977 andres@anarazel.de       3077                 :         241807 :         ts_stat->progress += ts_stat->progress_slice;
                               3078                 :         241807 :         ts_stat->num_scanned++;
                               3079                 :         241807 :         ts_stat->index++;
                               3080                 :                : 
                               3081                 :                :         /* Have all the buffers from the tablespace been processed? */
                               3082         [ +  + ]:         241807 :         if (ts_stat->num_scanned == ts_stat->num_to_scan)
                               3083                 :                :         {
                               3084                 :           1125 :             binaryheap_remove_first(ts_heap);
                               3085                 :                :         }
                               3086                 :                :         else
                               3087                 :                :         {
                               3088                 :                :             /* update heap with the new progress */
                               3089                 :         240682 :             binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
                               3090                 :                :         }
                               3091                 :                : 
                               3092                 :                :         /*
                               3093                 :                :          * Sleep to throttle our I/O rate.
                               3094                 :                :          *
                               3095                 :                :          * (This will check for barrier events even if it doesn't sleep.)
                               3096                 :                :          */
                               3097                 :         241807 :         CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
                               3098                 :                :     }
                               3099                 :                : 
                               3100                 :                :     /*
                               3101                 :                :      * Issue all pending flushes. Only checkpointer calls BufferSync(), so
                               3102                 :                :      * IOContext will always be IOCONTEXT_NORMAL.
                               3103                 :                :      */
  333                          3104                 :            749 :     IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
                               3105                 :                : 
 2977                          3106                 :            749 :     pfree(per_ts_stat);
                               3107                 :            749 :     per_ts_stat = NULL;
                               3108                 :            749 :     binaryheap_free(ts_heap);
                               3109                 :                : 
                               3110                 :                :     /*
                               3111                 :                :      * Update checkpoint statistics. As noted above, this doesn't include
                               3112                 :                :      * buffers written by other backends or bgwriter scan.
                               3113                 :                :      */
 6133 tgl@sss.pgh.pa.us        3114                 :            749 :     CheckpointStats.ckpt_bufs_written += num_written;
                               3115                 :                : 
                               3116                 :                :     TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
                               3117                 :                : }
                               3118                 :                : 
                               3119                 :                : /*
                               3120                 :                :  * BgBufferSync -- Write out some dirty buffers in the pool.
                               3121                 :                :  *
                               3122                 :                :  * This is called periodically by the background writer process.
                               3123                 :                :  *
                               3124                 :                :  * Returns true if it's appropriate for the bgwriter process to go into
                               3125                 :                :  * low-power hibernation mode.  (This happens if the strategy clock sweep
                               3126                 :                :  * has been "lapped" and no buffer allocations have occurred recently,
                               3127                 :                :  * or if the bgwriter has been effectively disabled by setting
                               3128                 :                :  * bgwriter_lru_maxpages to 0.)
                               3129                 :                :  */
                               3130                 :                : bool
 2977 andres@anarazel.de       3131                 :          15299 : BgBufferSync(WritebackContext *wb_context)
                               3132                 :                : {
                               3133                 :                :     /* info obtained from freelist.c */
                               3134                 :                :     int         strategy_buf_id;
                               3135                 :                :     uint32      strategy_passes;
                               3136                 :                :     uint32      recent_alloc;
                               3137                 :                : 
                               3138                 :                :     /*
                               3139                 :                :      * Information saved between calls so we can determine the strategy
                               3140                 :                :      * point's advance rate and avoid scanning already-cleaned buffers.
                               3141                 :                :      */
                               3142                 :                :     static bool saved_info_valid = false;
                               3143                 :                :     static int  prev_strategy_buf_id;
                               3144                 :                :     static uint32 prev_strategy_passes;
                               3145                 :                :     static int  next_to_clean;
                               3146                 :                :     static uint32 next_passes;
                               3147                 :                : 
                               3148                 :                :     /* Moving averages of allocation rate and clean-buffer density */
                               3149                 :                :     static float smoothed_alloc = 0;
                               3150                 :                :     static float smoothed_density = 10.0;
                               3151                 :                : 
                               3152                 :                :     /* Potentially these could be tunables, but for now, not */
 6046 tgl@sss.pgh.pa.us        3153                 :          15299 :     float       smoothing_samples = 16;
                               3154                 :          15299 :     float       scan_whole_pool_milliseconds = 120000.0;
                               3155                 :                : 
                               3156                 :                :     /* Used to compute how far we scan ahead */
                               3157                 :                :     long        strategy_delta;
                               3158                 :                :     int         bufs_to_lap;
                               3159                 :                :     int         bufs_ahead;
                               3160                 :                :     float       scans_per_alloc;
                               3161                 :                :     int         reusable_buffers_est;
                               3162                 :                :     int         upcoming_alloc_est;
                               3163                 :                :     int         min_scan_buffers;
                               3164                 :                : 
                               3165                 :                :     /* Variables for the scanning loop proper */
                               3166                 :                :     int         num_to_scan;
                               3167                 :                :     int         num_written;
                               3168                 :                :     int         reusable_buffers;
                               3169                 :                : 
                               3170                 :                :     /* Variables for final smoothed_density update */
                               3171                 :                :     long        new_strategy_delta;
                               3172                 :                :     uint32      new_recent_alloc;
                               3173                 :                : 
                               3174                 :                :     /*
                               3175                 :                :      * Find out where the freelist clock sweep currently is, and how many
                               3176                 :                :      * buffer allocations have happened since our last call.
                               3177                 :                :      */
                               3178                 :          15299 :     strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
                               3179                 :                : 
                               3180                 :                :     /* Report buffer alloc counts to pgstat */
  739 andres@anarazel.de       3181                 :          15299 :     PendingBgWriterStats.buf_alloc += recent_alloc;
                               3182                 :                : 
                               3183                 :                :     /*
                               3184                 :                :      * If we're not running the LRU scan, just stop after doing the stats
                               3185                 :                :      * stuff.  We mark the saved state invalid so that we can recover sanely
                               3186                 :                :      * if LRU scan is turned back on later.
                               3187                 :                :      */
 6046 tgl@sss.pgh.pa.us        3188         [ +  + ]:          15299 :     if (bgwriter_lru_maxpages <= 0)
                               3189                 :                :     {
 6046 tgl@sss.pgh.pa.us        3190                 :GBC          33 :         saved_info_valid = false;
 4462 heikki.linnakangas@i     3191                 :             33 :         return true;
                               3192                 :                :     }
                               3193                 :                : 
                               3194                 :                :     /*
                               3195                 :                :      * Compute strategy_delta = how many buffers have been scanned by the
                               3196                 :                :      * clock sweep since last time.  If first time through, assume none. Then
                               3197                 :                :      * see if we are still ahead of the clock sweep, and if so, how many
                               3198                 :                :      * buffers we could scan before we'd catch up with it and "lap" it. Note:
                               3199                 :                :      * weird-looking coding of xxx_passes comparisons is to avoid bogus
                               3200                 :                :      * behavior when the passes counts wrap around.
                               3201                 :                :      */
 6046 tgl@sss.pgh.pa.us        3202         [ +  + ]:CBC       15266 :     if (saved_info_valid)
                               3203                 :                :     {
 5995 bruce@momjian.us         3204                 :          14536 :         int32       passes_delta = strategy_passes - prev_strategy_passes;
                               3205                 :                : 
 6046 tgl@sss.pgh.pa.us        3206                 :          14536 :         strategy_delta = strategy_buf_id - prev_strategy_buf_id;
 2489                          3207                 :          14536 :         strategy_delta += (long) passes_delta * NBuffers;
                               3208                 :                : 
 6046                          3209         [ -  + ]:          14536 :         Assert(strategy_delta >= 0);
                               3210                 :                : 
                               3211         [ +  + ]:          14536 :         if ((int32) (next_passes - strategy_passes) > 0)
                               3212                 :                :         {
                               3213                 :                :             /* we're one pass ahead of the strategy point */
                               3214                 :           3193 :             bufs_to_lap = strategy_buf_id - next_to_clean;
                               3215                 :                : #ifdef BGW_DEBUG
                               3216                 :                :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
                               3217                 :                :                  next_passes, next_to_clean,
                               3218                 :                :                  strategy_passes, strategy_buf_id,
                               3219                 :                :                  strategy_delta, bufs_to_lap);
                               3220                 :                : #endif
                               3221                 :                :         }
                               3222         [ +  + ]:          11343 :         else if (next_passes == strategy_passes &&
                               3223         [ +  + ]:           9018 :                  next_to_clean >= strategy_buf_id)
                               3224                 :                :         {
                               3225                 :                :             /* on same pass, but ahead or at least not behind */
                               3226                 :           8757 :             bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
                               3227                 :                : #ifdef BGW_DEBUG
                               3228                 :                :             elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
                               3229                 :                :                  next_passes, next_to_clean,
                               3230                 :                :                  strategy_passes, strategy_buf_id,
                               3231                 :                :                  strategy_delta, bufs_to_lap);
                               3232                 :                : #endif
                               3233                 :                :         }
                               3234                 :                :         else
                               3235                 :                :         {
                               3236                 :                :             /*
                               3237                 :                :              * We're behind, so skip forward to the strategy point and start
                               3238                 :                :              * cleaning from there.
                               3239                 :                :              */
                               3240                 :                : #ifdef BGW_DEBUG
                               3241                 :                :             elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
                               3242                 :                :                  next_passes, next_to_clean,
                               3243                 :                :                  strategy_passes, strategy_buf_id,
                               3244                 :                :                  strategy_delta);
                               3245                 :                : #endif
                               3246                 :           2586 :             next_to_clean = strategy_buf_id;
                               3247                 :           2586 :             next_passes = strategy_passes;
                               3248                 :           2586 :             bufs_to_lap = NBuffers;
                               3249                 :                :         }
                               3250                 :                :     }
                               3251                 :                :     else
                               3252                 :                :     {
                               3253                 :                :         /*
                               3254                 :                :          * Initializing at startup or after LRU scanning had been off. Always
                               3255                 :                :          * start at the strategy point.
                               3256                 :                :          */
                               3257                 :                : #ifdef BGW_DEBUG
                               3258                 :                :         elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
                               3259                 :                :              strategy_passes, strategy_buf_id);
                               3260                 :                : #endif
                               3261                 :            730 :         strategy_delta = 0;
                               3262                 :            730 :         next_to_clean = strategy_buf_id;
                               3263                 :            730 :         next_passes = strategy_passes;
                               3264                 :            730 :         bufs_to_lap = NBuffers;
                               3265                 :                :     }
                               3266                 :                : 
                               3267                 :                :     /* Update saved info for next time */
                               3268                 :          15266 :     prev_strategy_buf_id = strategy_buf_id;
                               3269                 :          15266 :     prev_strategy_passes = strategy_passes;
                               3270                 :          15266 :     saved_info_valid = true;
                               3271                 :                : 
                               3272                 :                :     /*
                               3273                 :                :      * Compute how many buffers had to be scanned for each new allocation, ie,
                               3274                 :                :      * 1/density of reusable buffers, and track a moving average of that.
                               3275                 :                :      *
                               3276                 :                :      * If the strategy point didn't move, we don't update the density estimate.
                               3277                 :                :      */
                               3278   [ +  +  +  - ]:          15266 :     if (strategy_delta > 0 && recent_alloc > 0)
                               3279                 :                :     {
                               3280                 :           3479 :         scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
                               3281                 :           3479 :         smoothed_density += (scans_per_alloc - smoothed_density) /
                               3282                 :                :             smoothing_samples;
                               3283                 :                :     }
                               3284                 :                : 
                               3285                 :                :     /*
                               3286                 :                :      * Estimate how many reusable buffers there are between the current
                               3287                 :                :      * strategy point and where we've scanned ahead to, based on the smoothed
                               3288                 :                :      * density estimate.
                               3289                 :                :      */
                               3290                 :          15266 :     bufs_ahead = NBuffers - bufs_to_lap;
                               3291                 :          15266 :     reusable_buffers_est = (float) bufs_ahead / smoothed_density;
                               3292                 :                : 
                               3293                 :                :     /*
                               3294                 :                :      * Track a moving average of recent buffer allocations.  Here, rather than
                               3295                 :                :      * a true average we want a fast-attack, slow-decline behavior: we
                               3296                 :                :      * immediately follow any increase.
                               3297                 :                :      */
                               3298         [ +  + ]:          15266 :     if (smoothed_alloc <= (float) recent_alloc)
                               3299                 :           3500 :         smoothed_alloc = recent_alloc;
                               3300                 :                :     else
                               3301                 :          11766 :         smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
                               3302                 :                :             smoothing_samples;
                               3303                 :                : 
                               3304                 :                :     /* Scale the estimate by a GUC to allow more aggressive tuning. */
 4530                          3305                 :          15266 :     upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
                               3306                 :                : 
                               3307                 :                :     /*
                               3308                 :                :      * If recent_alloc remains at zero for many cycles, smoothed_alloc will
                               3309                 :                :      * eventually underflow to zero, and the underflows produce annoying
                               3310                 :                :      * kernel warnings on some platforms.  Once upcoming_alloc_est has gone to
                               3311                 :                :      * zero, there's no point in tracking smaller and smaller values of
                               3312                 :                :      * smoothed_alloc, so just reset it to exactly zero to avoid this
                               3313                 :                :      * syndrome.  It will pop back up as soon as recent_alloc increases.
                               3314                 :                :      */
                               3315         [ +  + ]:          15266 :     if (upcoming_alloc_est == 0)
                               3316                 :           1841 :         smoothed_alloc = 0;
                               3317                 :                : 
                               3318                 :                :     /*
                               3319                 :                :      * Even in cases where there's been little or no buffer allocation
                               3320                 :                :      * activity, we want to make a small amount of progress through the buffer
                               3321                 :                :      * cache so that as many reusable buffers as possible are clean after an
                               3322                 :                :      * idle period.
                               3323                 :                :      *
                               3324                 :                :      * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
                               3325                 :                :      * the BGW will be called during the scan_whole_pool time; slice the
                               3326                 :                :      * buffer pool into that many sections.
                               3327                 :                :      */
 6046                          3328                 :          15266 :     min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
                               3329                 :                : 
                               3330         [ +  + ]:          15266 :     if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
                               3331                 :                :     {
                               3332                 :                : #ifdef BGW_DEBUG
                               3333                 :                :         elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
                               3334                 :                :              upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
                               3335                 :                : #endif
                               3336                 :           7838 :         upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
                               3337                 :                :     }
                               3338                 :                : 
                               3339                 :                :     /*
                               3340                 :                :      * Now write out dirty reusable buffers, working forward from the
                               3341                 :                :      * next_to_clean point, until we have lapped the strategy scan, or cleaned
                               3342                 :                :      * enough buffers to match our estimate of the next cycle's allocation
                               3343                 :                :      * requirements, or hit the bgwriter_lru_maxpages limit.
                               3344                 :                :      */
                               3345                 :                : 
                               3346                 :          15266 :     num_to_scan = bufs_to_lap;
                               3347                 :          15266 :     num_written = 0;
                               3348                 :          15266 :     reusable_buffers = reusable_buffers_est;
                               3349                 :                : 
                               3350                 :                :     /* Execute the LRU scan */
                               3351   [ +  +  +  + ]:        1679728 :     while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
                               3352                 :                :     {
 2926 andres@anarazel.de       3353                 :        1664473 :         int         sync_state = SyncOneBuffer(next_to_clean, true,
                               3354                 :                :                                                wb_context);
                               3355                 :                : 
 6046 tgl@sss.pgh.pa.us        3356         [ +  + ]:        1664473 :         if (++next_to_clean >= NBuffers)
                               3357                 :                :         {
                               3358                 :           3065 :             next_to_clean = 0;
                               3359                 :           3065 :             next_passes++;
                               3360                 :                :         }
                               3361                 :        1664473 :         num_to_scan--;
                               3362                 :                : 
 2926 andres@anarazel.de       3363         [ +  + ]:        1664473 :         if (sync_state & BUF_WRITTEN)
                               3364                 :                :         {
 6046 tgl@sss.pgh.pa.us        3365                 :          34633 :             reusable_buffers++;
                               3366         [ +  + ]:          34633 :             if (++num_written >= bgwriter_lru_maxpages)
                               3367                 :                :             {
  739 andres@anarazel.de       3368                 :             11 :                 PendingBgWriterStats.maxwritten_clean++;
 6046 tgl@sss.pgh.pa.us        3369                 :             11 :                 break;
                               3370                 :                :             }
                               3371                 :                :         }
 2926 andres@anarazel.de       3372         [ +  + ]:        1629840 :         else if (sync_state & BUF_REUSABLE)
 6046 tgl@sss.pgh.pa.us        3373                 :        1177703 :             reusable_buffers++;
                               3374                 :                :     }
                               3375                 :                : 
  739 andres@anarazel.de       3376                 :          15266 :     PendingBgWriterStats.buf_written_clean += num_written;
                               3377                 :                : 
                               3378                 :                : #ifdef BGW_DEBUG
                               3379                 :                :     elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
                               3380                 :                :          recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
                               3381                 :                :          smoothed_density, reusable_buffers_est, upcoming_alloc_est,
                               3382                 :                :          bufs_to_lap - num_to_scan,
                               3383                 :                :          num_written,
                               3384                 :                :          reusable_buffers - reusable_buffers_est);
                               3385                 :                : #endif
                               3386                 :                : 
                               3387                 :                :     /*
                               3388                 :                :      * Consider the above scan as being like a new allocation scan.
                               3389                 :                :      * Characterize its density and update the smoothed one based on it. This
                               3390                 :                :      * effectively halves the moving average period in cases where both the
                               3391                 :                :      * strategy and the background writer are doing some useful scanning,
                               3392                 :                :      * which is helpful because a long memory isn't as desirable on the
                               3393                 :                :      * density estimates.
                               3394                 :                :      */
 4358 tgl@sss.pgh.pa.us        3395                 :          15266 :     new_strategy_delta = bufs_to_lap - num_to_scan;
                               3396                 :          15266 :     new_recent_alloc = reusable_buffers - reusable_buffers_est;
                               3397   [ +  +  +  + ]:          15266 :     if (new_strategy_delta > 0 && new_recent_alloc > 0)
                               3398                 :                :     {
                               3399                 :          11156 :         scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
 6046                          3400                 :          11156 :         smoothed_density += (scans_per_alloc - smoothed_density) /
                               3401                 :                :             smoothing_samples;
                               3402                 :                : 
                               3403                 :                : #ifdef BGW_DEBUG
                               3404                 :                :         elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
                               3405                 :                :              new_recent_alloc, new_strategy_delta,
                               3406                 :                :              scans_per_alloc, smoothed_density);
                               3407                 :                : #endif
                               3408                 :                :     }
                               3409                 :                : 
                               3410                 :                :     /* Return true if OK to hibernate */
 4358                          3411   [ +  +  +  + ]:          15266 :     return (bufs_to_lap == 0 && recent_alloc == 0);
                               3412                 :                : }
                               3413                 :                : 
                               3414                 :                : /*
                               3415                 :                :  * SyncOneBuffer -- process a single buffer during syncing.
                               3416                 :                :  *
                               3417                 :                :  * If skip_recently_used is true, we don't write currently-pinned buffers, nor
                               3418                 :                :  * buffers marked recently used, as these are not replacement candidates.
                               3419                 :                :  *
                               3420                 :                :  * Returns a bitmask containing the following flag bits:
                               3421                 :                :  *  BUF_WRITTEN: we wrote the buffer.
                               3422                 :                :  *  BUF_REUSABLE: buffer is available for replacement, ie, it has
                               3423                 :                :  *      pin count 0 and usage count 0.
                               3424                 :                :  *
                               3425                 :                :  * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
                               3426                 :                :  * after locking it, but we don't care all that much.)
                               3427                 :                :  */
                               3428                 :                : static int
 2977 andres@anarazel.de       3429                 :        1904150 : SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
                               3430                 :                : {
 3072 rhaas@postgresql.org     3431                 :        1904150 :     BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
 5995 bruce@momjian.us         3432                 :        1904150 :     int         result = 0;
                               3433                 :                :     uint32      buf_state;
                               3434                 :                :     BufferTag   tag;
                               3435                 :                : 
                               3436                 :                :     /* Make sure we can handle the pin */
 3373 andres@anarazel.de       3437                 :        1904150 :     ReservePrivateRefCountEntry();
  158 heikki.linnakangas@i     3438                 :GNC     1904150 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               3439                 :                : 
                               3440                 :                :     /*
                               3441                 :                :      * Check whether buffer needs writing.
                               3442                 :                :      *
                               3443                 :                :      * We can make this check without taking the buffer content lock so long
                               3444                 :                :      * as we mark pages dirty in access methods *before* logging changes with
                               3445                 :                :      * XLogInsert(): if someone marks the buffer dirty just after our check we
                               3446                 :                :      * don't worry, because our checkpoint.redo points before the log record
                               3447                 :                :      * for the upcoming changes, so we need not write such a dirty buffer.
                               3448                 :                :      */
 2926 andres@anarazel.de       3449                 :CBC     1904150 :     buf_state = LockBufHdr(bufHdr);
                               3450                 :                : 
                               3451         [ +  + ]:        1904150 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
                               3452         [ +  + ]:        1901537 :         BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
                               3453                 :                :     {
 6046 tgl@sss.pgh.pa.us        3454                 :        1213011 :         result |= BUF_REUSABLE;
                               3455                 :                :     }
                               3456         [ +  + ]:         691139 :     else if (skip_recently_used)
                               3457                 :                :     {
                               3458                 :                :         /* Caller told us not to write recently-used buffers */
 2926 andres@anarazel.de       3459                 :         452137 :         UnlockBufHdr(bufHdr, buf_state);
 6046 tgl@sss.pgh.pa.us        3460                 :         452137 :         return result;
                               3461                 :                :     }
                               3462                 :                : 
 2926 andres@anarazel.de       3463   [ +  +  +  + ]:        1452013 :     if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
                               3464                 :                :     {
                               3465                 :                :         /* It's clean, so nothing to do */
                               3466                 :        1177703 :         UnlockBufHdr(bufHdr, buf_state);
 6046 tgl@sss.pgh.pa.us        3467                 :        1177703 :         return result;
                               3468                 :                :     }
                               3469                 :                : 
                               3470                 :                :     /*
                               3471                 :                :      * Pin it, share-lock it, write it.  (FlushBuffer will do nothing if the
                               3472                 :                :      * buffer is clean by the time we've locked it.)
                               3473                 :                :      */
 6981                          3474                 :         274310 :     PinBuffer_Locked(bufHdr);
 3043 rhaas@postgresql.org     3475                 :         274310 :     LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
                               3476                 :                : 
  430 andres@anarazel.de       3477                 :         274310 :     FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
                               3478                 :                : 
 3043 rhaas@postgresql.org     3479                 :         274310 :     LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
                               3480                 :                : 
 2977 andres@anarazel.de       3481                 :         274310 :     tag = bufHdr->tag;
                               3482                 :                : 
  562 michael@paquier.xyz      3483                 :         274310 :     UnpinBuffer(bufHdr);
                               3484                 :                : 
                               3485                 :                :     /*
                               3486                 :                :      * SyncOneBuffer() is only called by checkpointer and bgwriter, so
                               3487                 :                :      * IOContext will always be IOCONTEXT_NORMAL.
                               3488                 :                :      */
  333 andres@anarazel.de       3489                 :         274310 :     ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
                               3490                 :                : 
 6046 tgl@sss.pgh.pa.us        3491                 :         274310 :     return result | BUF_WRITTEN;
                               3492                 :                : }
                               3493                 :                : 
                               3494                 :                : /*
                               3495                 :                :  *      AtEOXact_Buffers - clean up at end of transaction.
                               3496                 :                :  *
                               3497                 :                :  *      As of PostgreSQL 8.0, buffer pins should get released by the
                               3498                 :                :  *      ResourceOwner mechanism.  This routine is just a debugging
                               3499                 :                :  *      cross-check that no pins remain.
                               3500                 :                :  */
                               3501                 :                : void
 7922                          3502                 :         432909 : AtEOXact_Buffers(bool isCommit)
                               3503                 :                : {
 3586 andres@anarazel.de       3504                 :         432909 :     CheckForBufferLeaks();
                               3505                 :                : 
 7120 tgl@sss.pgh.pa.us        3506                 :         432909 :     AtEOXact_LocalBuffers(isCommit);
                               3507                 :                : 
 3515 andres@anarazel.de       3508         [ -  + ]:         432909 :     Assert(PrivateRefCountOverflowed == 0);
                               3509                 :         432909 : }
                               3510                 :                : 
                               3511                 :                : /*
                               3512                 :                :  * Initialize access to shared buffer pool
                               3513                 :                :  *
                               3514                 :                :  * This is called during backend startup (whether standalone or under the
                               3515                 :                :  * postmaster).  It sets up for this backend's access to the already-existing
                               3516                 :                :  * buffer pool.
                               3517                 :                :  */
                               3518                 :                : void
                               3519                 :          19575 : InitBufferPoolAccess(void)
                               3520                 :                : {
                               3521                 :                :     HASHCTL     hash_ctl;
                               3522                 :                : 
                               3523                 :          19575 :     memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
                               3524                 :                : 
                               3525                 :          19575 :     hash_ctl.keysize = sizeof(int32);
 2975                          3526                 :          19575 :     hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
                               3527                 :                : 
 3515                          3528                 :          19575 :     PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
                               3529                 :                :                                       HASH_ELEM | HASH_BLOBS);
                               3530                 :                : 
                               3531                 :                :     /*
                               3532                 :                :      * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
                               3533                 :                :      * the corresponding phase of backend shutdown.
                               3534                 :                :      */
  983                          3535         [ -  + ]:          19575 :     Assert(MyProc != NULL);
 6824 tgl@sss.pgh.pa.us        3536                 :          19575 :     on_shmem_exit(AtProcExit_Buffers, 0);
                               3537                 :          19575 : }
                               3538                 :                : 
                               3539                 :                : /*
                               3540                 :                :  * During backend exit, ensure that we released all shared-buffer locks and
                               3541                 :                :  * assert that we have no remaining pins.
                               3542                 :                :  */
                               3543                 :                : static void
                               3544                 :          18045 : AtProcExit_Buffers(int code, Datum arg)
                               3545                 :                : {
 7120                          3546                 :          18045 :     UnlockBuffers();
                               3547                 :                : 
 3586 andres@anarazel.de       3548                 :          18045 :     CheckForBufferLeaks();
                               3549                 :                : 
                               3550                 :                :     /* localbuf.c needs a chance too */
                               3551                 :          18045 :     AtProcExit_LocalBuffers();
                               3552                 :          18045 : }
                               3553                 :                : 
                               3554                 :                : /*
                               3555                 :                :  *      CheckForBufferLeaks - ensure this backend holds no buffer pins
                               3556                 :                :  *
                               3557                 :                :  *      As of PostgreSQL 8.0, buffer pins should get released by the
                               3558                 :                :  *      ResourceOwner mechanism.  This routine is just a debugging
                               3559                 :                :  *      cross-check that no pins remain.
                               3560                 :                :  */
                               3561                 :                : static void
                               3562                 :         450954 : CheckForBufferLeaks(void)
                               3563                 :                : {
                               3564                 :                : #ifdef USE_ASSERT_CHECKING
                               3565                 :         450954 :     int         RefCountErrors = 0;
                               3566                 :                :     PrivateRefCountEntry *res;
                               3567                 :                :     int         i;
                               3568                 :                :     char       *s;
                               3569                 :                : 
                               3570                 :                :     /* check the array */
 3515                          3571         [ +  + ]:        4058586 :     for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
                               3572                 :                :     {
                               3573                 :        3607632 :         res = &PrivateRefCountArray[i];
                               3574                 :                : 
                               3575         [ -  + ]:        3607632 :         if (res->buffer != InvalidBuffer)
                               3576                 :                :         {
  158 heikki.linnakangas@i     3577                 :UNC           0 :             s = DebugPrintBufferRefcount(res->buffer);
                               3578         [ #  # ]:              0 :             elog(WARNING, "buffer refcount leak: %s", s);
                               3579                 :              0 :             pfree(s);
                               3580                 :                : 
 3515 andres@anarazel.de       3581                 :UBC           0 :             RefCountErrors++;
                               3582                 :                :         }
                               3583                 :                :     }
                               3584                 :                : 
                               3585                 :                :     /* if necessary search the hash */
 3515 andres@anarazel.de       3586         [ -  + ]:CBC      450954 :     if (PrivateRefCountOverflowed)
                               3587                 :                :     {
                               3588                 :                :         HASH_SEQ_STATUS hstat;
                               3589                 :                : 
 3515 andres@anarazel.de       3590                 :UBC           0 :         hash_seq_init(&hstat, PrivateRefCountHash);
                               3591         [ #  # ]:              0 :         while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
                               3592                 :                :         {
  158 heikki.linnakangas@i     3593                 :UNC           0 :             s = DebugPrintBufferRefcount(res->buffer);
                               3594         [ #  # ]:              0 :             elog(WARNING, "buffer refcount leak: %s", s);
                               3595                 :              0 :             pfree(s);
 3586 andres@anarazel.de       3596                 :UBC           0 :             RefCountErrors++;
                               3597                 :                :         }
                               3598                 :                :     }
                               3599                 :                : 
 3586 andres@anarazel.de       3600         [ -  + ]:CBC      450954 :     Assert(RefCountErrors == 0);
                               3601                 :                : #endif
 7120 tgl@sss.pgh.pa.us        3602                 :         450954 : }
                               3603                 :                : 
                               3604                 :                : /*
                               3605                 :                :  * Helper to describe an unexpectedly pinned buffer; callers pfree the result
                               3606                 :                :  */
                               3607                 :                : char *
  158 heikki.linnakangas@i     3608                 :UNC           0 : DebugPrintBufferRefcount(Buffer buffer)
                               3609                 :                : {
                               3610                 :                :     BufferDesc *buf;
                               3611                 :                :     int32       loccount;
                               3612                 :                :     char       *path;
                               3613                 :                :     char       *result;
                               3614                 :                :     ProcNumber  backend;
                               3615                 :                :     uint32      buf_state;
                               3616                 :                : 
 7120 tgl@sss.pgh.pa.us        3617         [ #  # ]:UBC           0 :     Assert(BufferIsValid(buffer));
                               3618         [ #  # ]:              0 :     if (BufferIsLocal(buffer))
                               3619                 :                :     {
 3363 andres@anarazel.de       3620                 :              0 :         buf = GetLocalBufferDescriptor(-buffer - 1);
 7120 tgl@sss.pgh.pa.us        3621                 :              0 :         loccount = LocalRefCount[-buffer - 1];
   42 heikki.linnakangas@i     3622                 :UNC           0 :         backend = MyProcNumber;
                               3623                 :                :     }
                               3624                 :                :     else
                               3625                 :                :     {
 3363 andres@anarazel.de       3626                 :UBC           0 :         buf = GetBufferDescriptor(buffer - 1);
 3515                          3627                 :              0 :         loccount = GetPrivateRefCount(buffer);
   42 heikki.linnakangas@i     3628                 :UNC           0 :         backend = INVALID_PROC_NUMBER;
                               3629                 :                :     }
                               3630                 :                : 
                               3631                 :                :     /* theoretically we should lock the bufhdr here */
  599 rhaas@postgresql.org     3632                 :UBC           0 :     path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
                               3633                 :                :                           BufTagGetForkNum(&buf->tag));
 2926 andres@anarazel.de       3634                 :              0 :     buf_state = pg_atomic_read_u32(&buf->state);
                               3635                 :                : 
  158 heikki.linnakangas@i     3636                 :UNC           0 :     result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
                               3637                 :                :                       buffer, path,
                               3638                 :                :                       buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
                               3639                 :                :                       BUF_STATE_GET_REFCOUNT(buf_state), loccount);
 5633 heikki.linnakangas@i     3640                 :UBC           0 :     pfree(path);
  158 heikki.linnakangas@i     3641                 :UNC           0 :     return result;
 7227 tgl@sss.pgh.pa.us        3642                 :EUB             : }
                               3643                 :                : 
                               3644                 :                : /*
                               3645                 :                :  * CheckPointBuffers
                               3646                 :                :  *
                               3647                 :                :  * Flush all dirty blocks in buffer pool to disk at checkpoint time.
                               3648                 :                :  *
                               3649                 :                :  * Note: temporary relations do not participate in checkpoints, so they don't
                               3650                 :                :  * need to be flushed.
                                                    :                :  *
                                                    :                :  * This is a thin wrapper: flags is not interpreted here, it is passed
                                                    :                :  * unchanged to BufferSync(), which does all of the work.
                               3651                 :                :  */
                               3652                 :                : void
 6135 tgl@sss.pgh.pa.us        3653                 :CBC        1153 : CheckPointBuffers(int flags)
                               3654                 :                : {
                               3655                 :           1153 :     BufferSync(flags);
 8536 vadim4o@yahoo.com        3656                 :           1148 : }
                               3657                 :                : 
                               3658                 :                : /*
                               3659                 :                :  * BufferGetBlockNumber
                               3660                 :                :  *      Returns the block number associated with a buffer.
                               3661                 :                :  *
                               3662                 :                :  * Note:
                               3663                 :                :  *      Assumes that the buffer is valid and pinned, else the
                               3664                 :                :  *      value may be obsolete immediately...
                                                    :                :  *
                                                    :                :  *      Works for both local and shared buffers: the descriptor is
                                                    :                :  *      fetched with GetLocalBufferDescriptor() or GetBufferDescriptor()
                                                    :                :  *      depending on BufferIsLocal(), and the block number is read from
                                                    :                :  *      its tag.  The pin requirement is only checked by an Assert.
                               3665                 :                :  */
                               3666                 :                : BlockNumber
10141 scrappy@hub.org          3667                 :       80255914 : BufferGetBlockNumber(Buffer buffer)
                               3668                 :                : {
                               3669                 :                :     BufferDesc *bufHdr;
                               3670                 :                : 
 8035 bruce@momjian.us         3671   [ -  +  +  +  :       80255914 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               3672                 :                : 
 9716                          3673         [ +  + ]:       80255914 :     if (BufferIsLocal(buffer))
 3363 andres@anarazel.de       3674                 :        2359292 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
                               3675                 :                :     else
                               3676                 :       77896622 :         bufHdr = GetBufferDescriptor(buffer - 1);
                               3677                 :                : 
                               3678                 :                :     /* pinned, so OK to read tag without spinlock */
 6981 tgl@sss.pgh.pa.us        3679                 :       80255914 :     return bufHdr->tag.blockNum;
                               3680                 :                : }
                               3681                 :                : 
                               3682                 :                : /*
                               3683                 :                :  * BufferGetTag
                               3684                 :                :  *      Returns the relfilelocator, fork number and block number associated with
                               3685                 :                :  *      a buffer.
                                                    :                :  *
                                                    :                :  *      rlocator, forknum and blknum are output parameters.  All three are
                                                    :                :  *      written unconditionally, so none of them may be NULL.
                                                    :                :  *
                                                    :                :  *      As in BufferGetBlockNumber, the buffer must be pinned (checked only
                                                    :                :  *      by an Assert); the tag is then read without taking the buffer
                                                    :                :  *      header spinlock.
                               3686                 :                :  */
                               3687                 :                : void
  648 rhaas@postgresql.org     3688                 :       13945493 : BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
                               3689                 :                :              BlockNumber *blknum)
                               3690                 :                : {
                               3691                 :                :     BufferDesc *bufHdr;
                               3692                 :                : 
                               3693                 :                :     /* Do the same checks as BufferGetBlockNumber. */
 5725 heikki.linnakangas@i     3694   [ -  +  -  +  :       13945493 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               3695                 :                : 
 7300 tgl@sss.pgh.pa.us        3696         [ -  + ]:       13945493 :     if (BufferIsLocal(buffer))
 3363 andres@anarazel.de       3697                 :UBC           0 :         bufHdr = GetLocalBufferDescriptor(-buffer - 1);
                               3698                 :                :     else
 3363 andres@anarazel.de       3699                 :CBC    13945493 :         bufHdr = GetBufferDescriptor(buffer - 1);
                               3700                 :                : 
                               3701                 :                :     /* pinned, so OK to read tag without spinlock */
  599 rhaas@postgresql.org     3702                 :       13945493 :     *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
                               3703                 :       13945493 :     *forknum = BufTagGetForkNum(&bufHdr->tag);
 5725 heikki.linnakangas@i     3704                 :       13945493 :     *blknum = bufHdr->tag.blockNum;
 7300 tgl@sss.pgh.pa.us        3705                 :       13945493 : }
                               3706                 :                : 
                               3707                 :                : /*
                               3708                 :                :  * FlushBuffer
                               3709                 :                :  *      Physically write out a shared buffer.
                               3710                 :                :  *
                               3711                 :                :  * NOTE: this actually just passes the buffer contents to the kernel; the
                               3712                 :                :  * real write to disk won't happen until the kernel feels like it.  This
                               3713                 :                :  * is okay from our point of view since we can redo the changes from WAL.
                               3714                 :                :  * However, we will need to force the changes to disk via fsync before
                               3715                 :                :  * we can checkpoint WAL.
                               3716                 :                :  *
                               3717                 :                :  * The caller must hold a pin on the buffer and have share-locked the
                               3718                 :                :  * buffer contents.  (Note: a share-lock does not prevent updates of
                               3719                 :                :  * hint bits in the buffer, so the page could change while the write
                               3720                 :                :  * is in progress, but we assume that that will not invalidate the data
                               3721                 :                :  * written.)
                               3722                 :                :  *
                               3723                 :                :  * If the caller has an smgr reference for the buffer's relation, pass it
                               3724                 :                :  * as the second parameter.  If not, pass NULL.
                                                    :                :  *
                                                    :                :  * io_context is the IOContext under which the write is counted in the IO
                                                    :                :  * statistics.
                                                    :                :  *
                                                    :                :  * NOTE(review): io_object is accepted but never referenced in the body;
                                                    :                :  * the statistics call below passes IOOBJECT_RELATION directly.  Confirm
                                                    :                :  * that every caller passes IOOBJECT_RELATION before relying on the
                                                    :                :  * parameter.
                                                    :                :  *
                                                    :                :  * If StartBufferIO() returns false the function returns immediately
                                                    :                :  * without writing anything and without touching the error context stack.
                               3725                 :                :  */
                               3726                 :                : static void
  430 andres@anarazel.de       3727                 :         530338 : FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
                               3728                 :                :             IOContext io_context)
                               3729                 :                : {
                               3730                 :                :     XLogRecPtr  recptr;
                               3731                 :                :     ErrorContextCallback errcallback;
                               3732                 :                :     instr_time  io_start;
                               3733                 :                :     Block       bufBlock;
                               3734                 :                :     char       *bufToWrite;
                               3735                 :                :     uint32      buf_state;
                               3736                 :                : 
                               3737                 :                :     /*
                               3738                 :                :      * Try to start an I/O operation.  If StartBufferIO returns false, then
                               3739                 :                :      * someone else flushed the buffer before we could, so we need not do
                               3740                 :                :      * anything.
                               3741                 :                :      */
   11 tmunro@postgresql.or     3742         [ -  + ]:GNC      530338 :     if (!StartBufferIO(buf, false, false))
 6981 tgl@sss.pgh.pa.us        3743                 :UBC           0 :         return;
                               3744                 :                : 
                               3745                 :                :     /* Setup error traceback support for ereport() */
 4171 heikki.linnakangas@i     3746                 :CBC      530338 :     errcallback.callback = shared_buffer_write_error_callback;
                               3747                 :         530338 :     errcallback.arg = (void *) buf;
                               3748                 :         530338 :     errcallback.previous = error_context_stack;
                               3749                 :         530338 :     error_context_stack = &errcallback;
                               3750                 :                : 
                               3751                 :                :     /* Find smgr relation for buffer */
 7298 tgl@sss.pgh.pa.us        3752         [ +  + ]:         530338 :     if (reln == NULL)
   42 heikki.linnakangas@i     3753                 :GNC      526808 :         reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
                               3754                 :                : 
                               3755                 :                :     TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
                               3756                 :                :                                         buf->tag.blockNum,
                               3757                 :                :                                         reln->smgr_rlocator.locator.spcOid,
                               3758                 :                :                                         reln->smgr_rlocator.locator.dbOid,
                               3759                 :                :                                         reln->smgr_rlocator.locator.relNumber);
                               3760                 :                : 
 2926 andres@anarazel.de       3761                 :CBC      530338 :     buf_state = LockBufHdr(buf);
                               3762                 :                : 
                               3763                 :                :     /*
                               3764                 :                :      * Run PageGetLSN while holding header lock, since we don't have the
                               3765                 :                :      * buffer locked exclusively in all cases.
                               3766                 :                :      */
 4041 simon@2ndQuadrant.co     3767                 :         530338 :     recptr = BufferGetLSN(buf);
                               3768                 :                : 
                               3769                 :                :     /* To check if block content changes while flushing. - vadim 01/17/97 */
 2926 andres@anarazel.de       3770                 :         530338 :     buf_state &= ~BM_JUST_DIRTIED;
                               3771                 :         530338 :     UnlockBufHdr(buf, buf_state);
                               3772                 :                : 
                               3773                 :                :     /*
                               3774                 :                :      * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
                               3775                 :                :      * rule that log updates must hit disk before any of the data-file changes
                               3776                 :                :      * they describe do.
                               3777                 :                :      *
                               3778                 :                :      * However, this rule does not apply to unlogged relations, which will be
                               3779                 :                :      * lost after a crash anyway.  Most unlogged relation pages do not bear
                               3780                 :                :      * LSNs since we never emit WAL records for them, and therefore flushing
                               3781                 :                :      * up through the buffer LSN would be useless, but harmless.  However,
                               3782                 :                :      * GiST indexes use LSNs internally to track page-splits, and therefore
                               3783                 :                :      * unlogged GiST pages bear "fake" LSNs generated by
                               3784                 :                :      * GetFakeLSNForUnloggedRel.  It is unlikely but possible that the fake
                               3785                 :                :      * LSN counter could advance past the WAL insertion point; and if it did
                               3786                 :                :      * happen, attempting to flush WAL through that location would fail, with
                               3787                 :                :      * disastrous system-wide consequences.  To make sure that can't happen,
                               3788                 :                :      * skip the flush if the buffer isn't permanent.
                               3789                 :                :      */
                               3790         [ +  + ]:         530338 :     if (buf_state & BM_PERMANENT)
 4080 heikki.linnakangas@i     3791                 :         528285 :         XLogFlush(recptr);
                               3792                 :                : 
                               3793                 :                :     /*
                               3794                 :                :      * Now it's safe to write buffer to disk. Note that no one else should
                               3795                 :                :      * have been able to write it while we were busy with log flushing because
                               3796                 :                :      * only one process at a time can set the BM_IO_IN_PROGRESS bit.
                               3797                 :                :      */
 4041 simon@2ndQuadrant.co     3798                 :         530338 :     bufBlock = BufHdrGetBlock(buf);
                               3799                 :                : 
                               3800                 :                :     /*
                               3801                 :                :      * Update page checksum if desired.  Since we have only shared lock on the
                               3802                 :                :      * buffer, other processes might be updating hint bits in it, so we must
                               3803                 :                :      * copy the page to private storage if we do checksumming.
                               3804                 :                :      */
                               3805                 :         530338 :     bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
                               3806                 :                : 
  120 michael@paquier.xyz      3807                 :GNC      530338 :     io_start = pgstat_prepare_io_time(track_io_timing);
                               3808                 :                : 
                               3809                 :                :     /*
                               3810                 :                :      * bufToWrite is either the shared buffer or a copy, as appropriate.
                               3811                 :                :      */
 7369 tgl@sss.pgh.pa.us        3812                 :CBC      530338 :     smgrwrite(reln,
  599 rhaas@postgresql.org     3813                 :         530338 :               BufTagGetForkNum(&buf->tag),
                               3814                 :                :               buf->tag.blockNum,
                               3815                 :                :               bufToWrite,
                               3816                 :                :               false);
                               3817                 :                : 
                               3818                 :                :     /*
                               3819                 :                :      * When a strategy is in use, only flushes of dirty buffers already in the
                               3820                 :                :      * strategy ring are counted as strategy writes (IOCONTEXT
                               3821                 :                :      * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
                               3822                 :                :      * statistics tracking.
                               3823                 :                :      *
                               3824                 :                :      * If a shared buffer initially added to the ring must be flushed before
                               3825                 :                :      * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
                               3826                 :                :      *
                               3827                 :                :      * If a shared buffer which was added to the ring later because the
                               3828                 :                :      * current strategy buffer is pinned or in use or because all strategy
                               3829                 :                :      * buffers were dirty and rejected (for BAS_BULKREAD operations only)
                               3830                 :                :      * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
                               3831                 :                :      * (from_ring will be false).
                               3832                 :                :      *
                               3833                 :                :      * When a strategy is not in use, the write can only be a "regular" write
                               3834                 :                :      * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
                               3835                 :                :      */
  373 andres@anarazel.de       3836                 :         530338 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
                               3837                 :                :                             IOOP_WRITE, io_start, 1);
                               3838                 :                : 
 5234 rhaas@postgresql.org     3839                 :         530338 :     pgBufferUsage.shared_blks_written++;
                               3840                 :                : 
                               3841                 :                :     /*
                               3842                 :                :      * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
                               3843                 :                :      * end the BM_IO_IN_PROGRESS state.
                               3844                 :                :      */
  158 heikki.linnakangas@i     3845                 :GNC      530338 :     TerminateBufferIO(buf, true, 0, true);
                               3846                 :                : 
                               3847                 :                :     TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
                               3848                 :                :                                        buf->tag.blockNum,
                               3849                 :                :                                        reln->smgr_rlocator.locator.spcOid,
                               3850                 :                :                                        reln->smgr_rlocator.locator.dbOid,
                               3851                 :                :                                        reln->smgr_rlocator.locator.relNumber);
                               3852                 :                : 
                               3853                 :                :     /* Pop the error context stack */
 4171 heikki.linnakangas@i     3854                 :CBC      530338 :     error_context_stack = errcallback.previous;
                               3855                 :                : }
                               3856                 :                : 
                               3857                 :                : /*
                               3858                 :                :  * RelationGetNumberOfBlocksInFork
                               3859                 :                :  *      Determines the current number of pages in the specified relation fork.
                               3860                 :                :  *
                               3861                 :                :  * Note that the accuracy of the result will depend on the details of the
                               3862                 :                :  * relation's storage. For builtin AMs it'll be accurate, but for external AMs
                               3863                 :                :  * it might not be.
                                                    :                :  *
                                                    :                :  * For relkinds that have a table AM, the size is obtained in bytes from
                                                    :                :  * table_relation_size() and rounded up to whole BLCKSZ blocks.  For other
                                                    :                :  * relkinds that have storage, it is obtained from smgrnblocks().  Any other
                                                    :                :  * relkind trips Assert(false); the trailing return of 0 is what is reached
                                                    :                :  * if that Assert does not stop execution.
                               3864                 :                :  */
                               3865                 :                : BlockNumber
 4855 rhaas@postgresql.org     3866                 :        1537475 : RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
                               3867                 :                : {
  863 peter@eisentraut.org     3868   [ +  +  +  +  :        1537475 :     if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
                                              +  + ]
                               3869                 :                :     {
                               3870                 :                :         /*
                               3871                 :                :          * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
                               3872                 :                :          * tableam returns the size in bytes - but for the purpose of this
                               3873                 :                :          * routine, we want the number of blocks. Therefore divide, rounding
                               3874                 :                :          * up.
                               3875                 :                :          */
                               3876                 :                :         uint64      szbytes;
                               3877                 :                : 
                               3878                 :        1161522 :         szbytes = table_relation_size(relation, forkNum);
                               3879                 :                : 
                               3880                 :        1161503 :         return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
                               3881                 :                :     }
                               3882   [ +  -  +  +  :         375953 :     else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
                                     -  +  -  -  -  
                                                 - ]
                               3883                 :                :     {
  703 tgl@sss.pgh.pa.us        3884                 :         375953 :         return smgrnblocks(RelationGetSmgr(relation), forkNum);
                               3885                 :                :     }
                               3886                 :                :     else
  863 peter@eisentraut.org     3887                 :UBC           0 :         Assert(false);
                               3888                 :                : 
                               3889                 :                :     return 0;                   /* keep compiler quiet */
                               3890                 :                : }
                               3891                 :                : 
                               3892                 :                : /*
                               3893                 :                :  * BufferIsPermanent
                               3894                 :                :  *      Determines whether a buffer will potentially still be around after
                               3895                 :                :  *      a crash.  Caller must hold a buffer pin.
                                                    :                :  *
                                                    :                :  *      Local buffers return false immediately, before the validity and
                                                    :                :  *      pin Asserts are reached.  For shared buffers the result is the
                                                    :                :  *      BM_PERMANENT bit of the buffer header state, fetched with a single
                                                    :                :  *      atomic read and no header lock.
                               3896                 :                :  */
                               3897                 :                : bool
 4552 rhaas@postgresql.org     3898                 :CBC     8944186 : BufferIsPermanent(Buffer buffer)
                               3899                 :                : {
                               3900                 :                :     BufferDesc *bufHdr;
                               3901                 :                : 
                               3902                 :                :     /* Local buffers are used only for temp relations. */
                               3903         [ +  + ]:        8944186 :     if (BufferIsLocal(buffer))
                               3904                 :         672546 :         return false;
                               3905                 :                : 
                               3906                 :                :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
                               3907         [ -  + ]:        8271640 :     Assert(BufferIsValid(buffer));
                               3908   [ -  +  -  +  :        8271640 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               3909                 :                : 
                               3910                 :                :     /*
                               3911                 :                :      * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
                               3912                 :                :      * need not bother with the buffer header spinlock.  Even if someone else
                               3913                 :                :      * changes the buffer header state while we're doing this, the state is
                               3914                 :                :      * changed atomically, so we'll read the old value or the new value, but
                               3915                 :                :      * not random garbage.
                               3916                 :                :      */
 3363 andres@anarazel.de       3917                 :        8271640 :     bufHdr = GetBufferDescriptor(buffer - 1);
 2926                          3918                 :        8271640 :     return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
                               3919                 :                : }
                               3920                 :                : 
                               3921                 :                : /*
                               3922                 :                :  * BufferGetLSNAtomic
                               3923                 :                :  *      Retrieves the LSN of the buffer atomically using a buffer header lock.
                               3924                 :                :  *      This is necessary for some callers who may not have an exclusive lock
                               3925                 :                :  *      on the buffer.
                                                    :                :  *
                                                    :                :  *      When XLogHintBitIsNeeded() is false, or the buffer is local, the
                                                    :                :  *      header lock is skipped and the LSN is read straight from the page.
                                                    :                :  *
                                                    :                :  *      NOTE(review): bufHdr is initialised from
                                                    :                :  *      GetBufferDescriptor(buffer - 1) before the BufferIsLocal() test.
                                                    :                :  *      The pointer is never used on the fast path, but for a local buffer
                                                    :                :  *      the index is presumably outside the shared descriptor array (other
                                                    :                :  *      functions in this file index local buffers as -buffer - 1).
                                                    :                :  *      Consider moving the lookup below the fast-path return; confirm
                                                    :                :  *      against GetBufferDescriptor's definition.
                               3926                 :                :  */
                               3927                 :                : XLogRecPtr
 4041 simon@2ndQuadrant.co     3928                 :        7592142 : BufferGetLSNAtomic(Buffer buffer)
                               3929                 :                : {
 3072 rhaas@postgresql.org     3930                 :        7592142 :     BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
 2916 kgrittn@postgresql.o     3931                 :        7592142 :     char       *page = BufferGetPage(buffer);
                               3932                 :                :     XLogRecPtr  lsn;
                               3933                 :                :     uint32      buf_state;
                               3934                 :                : 
                               3935                 :                :     /*
                               3936                 :                :      * If we don't need locking for correctness, fastpath out.
                               3937                 :                :      */
 3215 heikki.linnakangas@i     3938   [ +  +  +  +  :        7592142 :     if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
                                              +  + ]
 4041 simon@2ndQuadrant.co     3939                 :        5889490 :         return PageGetLSN(page);
                               3940                 :                : 
                               3941                 :                :     /* Make sure we've got a real buffer, and that we hold a pin on it. */
                               3942         [ -  + ]:        1702652 :     Assert(BufferIsValid(buffer));
                               3943   [ -  +  -  +  :        1702652 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               3944                 :                : 
 2926 andres@anarazel.de       3945                 :        1702652 :     buf_state = LockBufHdr(bufHdr);
 4041 simon@2ndQuadrant.co     3946                 :        1702652 :     lsn = PageGetLSN(page);
 2926 andres@anarazel.de       3947                 :        1702652 :     UnlockBufHdr(bufHdr, buf_state);
                               3948                 :                : 
 4041 simon@2ndQuadrant.co     3949                 :        1702652 :     return lsn;
                               3950                 :                : }
                               3951                 :                : 
                               3952                 :                : /* ---------------------------------------------------------------------
                               3953                 :                :  *      DropRelationBuffers
                               3954                 :                :  *
                               3955                 :                :  *      This function removes from the buffer pool all the pages of the
                               3956                 :                :  *      specified relation forks that have block numbers >= firstDelBlock.
                               3957                 :                :  *      (In particular, with firstDelBlock = 0, all pages are removed.)
                               3958                 :                :  *      Dirty pages are simply dropped, without bothering to write them
                               3959                 :                :  *      out first.  Therefore, this is NOT rollback-able, and so should be
                               3960                 :                :  *      used only with extreme caution!
                               3961                 :                :  *
                               3962                 :                :  *      Currently, this is called only from smgr.c when the underlying file
                               3963                 :                :  *      is about to be deleted or truncated (firstDelBlock is needed for
                               3964                 :                :  *      the truncation case).  The data in the affected pages would therefore
                               3965                 :                :  *      be deleted momentarily anyway, and there is no point in writing it.
                               3966                 :                :  *      It is the responsibility of higher-level code to ensure that the
                               3967                 :                :  *      deletion or truncation does not lose any data that could be needed
                               3968                 :                :  *      later.  It is also the responsibility of higher-level code to ensure
                               3969                 :                :  *      that no other process could be trying to load more pages of the
                               3970                 :                :  *      relation into buffers.
                               3971                 :                :  * --------------------------------------------------------------------
                               3972                 :                :  */
                               3973                 :                : void
  642 rhaas@postgresql.org     3974                 :            578 : DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
                               3975                 :                :                     int nforks, BlockNumber *firstDelBlock)
                               3976                 :                : {
                               3977                 :                :     int         i;
                               3978                 :                :     int         j;
                               3979                 :                :     RelFileLocatorBackend rlocator;
                               3980                 :                :     BlockNumber nForkBlock[MAX_FORKNUM];
 1068 tgl@sss.pgh.pa.us        3981                 :            578 :     uint64      nBlocksToInvalidate = 0;
                               3982                 :                : 
  648 rhaas@postgresql.org     3983                 :            578 :     rlocator = smgr_reln->smgr_rlocator;
                               3984                 :                : 
                               3985                 :                :     /* If it's a local relation, it's localbuf.c's problem. */
                               3986         [ +  + ]:            578 :     if (RelFileLocatorBackendIsTemp(rlocator))
                               3987                 :                :     {
   42 heikki.linnakangas@i     3988         [ +  - ]:GNC         329 :         if (rlocator.backend == MyProcNumber)
                               3989                 :                :         {
 1664 fujii@postgresql.org     3990         [ +  + ]:CBC         675 :             for (j = 0; j < nforks; j++)
  642 rhaas@postgresql.org     3991                 :            346 :                 DropRelationLocalBuffers(rlocator.locator, forkNum[j],
                               3992                 :            346 :                                          firstDelBlock[j]);
                               3993                 :                :         }
 7922 tgl@sss.pgh.pa.us        3994                 :            371 :         return;
                               3995                 :                :     }
                               3996                 :                : 
                               3997                 :                :     /*
                               3998                 :                :      * To remove all the pages of the specified relation forks from the buffer
                               3999                 :                :      * pool, we need to scan the entire buffer pool but we can optimize it by
                               4000                 :                :      * finding the buffers from BufMapping table provided we know the exact
                               4001                 :                :      * size of each fork of the relation. The exact size is required to ensure
                               4002                 :                :      * that we don't leave any buffer for the relation being dropped as
                               4003                 :                :      * otherwise the background writer or checkpointer can lead to a PANIC
                               4004                 :                :      * error while flushing buffers corresponding to files that don't exist.
                               4005                 :                :      *
                               4006                 :                :      * To know the exact size, we rely on the size that we cache for each fork
                               4007                 :                :      * during recovery, which limits the optimization to recovery and
                               4008                 :                :      * standbys, but we can easily extend it once we have a shared cache for
                               4009                 :                :      * relation size.
                               4010                 :                :      *
                               4011                 :                :      * In recovery, we cache the value returned by the first lseek(SEEK_END)
                               4012                 :                :      * and future writes keep the cached value up-to-date. See
                               4013                 :                :      * smgrextend. It is possible that the value of the first lseek is smaller
                               4014                 :                :      * than the actual number of existing blocks in the file due to buggy
                               4015                 :                :      * Linux kernels that might not have accounted for the recent write. But
                               4016                 :                :      * that should be fine because there must not be any buffers after that
                               4017                 :                :      * file size.
                               4018                 :                :      */
 1188 akapila@postgresql.o     4019         [ +  + ]:            347 :     for (i = 0; i < nforks; i++)
                               4020                 :                :     {
                               4021                 :                :         /* Get the number of blocks for a relation's fork */
                               4022                 :            297 :         nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
                               4023                 :                : 
                               4024         [ +  + ]:            297 :         if (nForkBlock[i] == InvalidBlockNumber)
                               4025                 :                :         {
                               4026                 :            199 :             nBlocksToInvalidate = InvalidBlockNumber;
                               4027                 :            199 :             break;
                               4028                 :                :         }
                               4029                 :                : 
                               4030                 :                :         /* calculate the number of blocks to be invalidated */
                               4031                 :             98 :         nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
                               4032                 :                :     }
                               4033                 :                : 
                               4034                 :                :     /*
                               4035                 :                :      * We apply the optimization iff the total number of blocks to invalidate
                               4036                 :                :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
                               4037                 :                :      */
                               4038         [ +  + ]:            249 :     if (BlockNumberIsValid(nBlocksToInvalidate) &&
                               4039         [ +  + ]:             50 :         nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
                               4040                 :                :     {
                               4041         [ +  + ]:            121 :         for (j = 0; j < nforks; j++)
  642 rhaas@postgresql.org     4042                 :             79 :             FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
                               4043                 :             79 :                                        nForkBlock[j], firstDelBlock[j]);
 1188 akapila@postgresql.o     4044                 :             42 :         return;
                               4045                 :                :     }
                               4046                 :                : 
 6981 tgl@sss.pgh.pa.us        4047         [ +  + ]:        2692687 :     for (i = 0; i < NBuffers; i++)
                               4048                 :                :     {
 3072 rhaas@postgresql.org     4049                 :        2692480 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
                               4050                 :                :         uint32      buf_state;
                               4051                 :                : 
                               4052                 :                :         /*
                               4053                 :                :          * We can make this a tad faster by prechecking the buffer tag before
                               4054                 :                :          * we attempt to lock the buffer; this saves a lot of lock
                               4055                 :                :          * acquisitions in typical cases.  It should be safe because the
                               4056                 :                :          * caller must have AccessExclusiveLock on the relation, or some other
                               4057                 :                :          * reason to be certain that no one is loading new pages of the rel
                               4058                 :                :          * into the buffer pool.  (Otherwise we might well miss such pages
                               4059                 :                :          * entirely.)  Therefore, while the tag might be changing while we
                               4060                 :                :          * look at it, it can't be changing *to* a value we care about, only
                               4061                 :                :          * *away* from such a value.  So false negatives are impossible, and
                               4062                 :                :          * false positives are safe because we'll recheck after getting the
                               4063                 :                :          * buffer lock.
                               4064                 :                :          *
                               4065                 :                :          * We could check forkNum and blockNum as well as the rlocator, but
                               4066                 :                :          * the incremental win from doing so seems small.
                               4067                 :                :          */
  599                          4068         [ +  + ]:        2692480 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
 4329 tgl@sss.pgh.pa.us        4069                 :        2683660 :             continue;
                               4070                 :                : 
 2926 andres@anarazel.de       4071                 :           8820 :         buf_state = LockBufHdr(bufHdr);
                               4072                 :                : 
 1664 fujii@postgresql.org     4073         [ +  + ]:          21892 :         for (j = 0; j < nforks; j++)
                               4074                 :                :         {
  599 rhaas@postgresql.org     4075         [ +  - ]:          15493 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
                               4076         [ +  + ]:          15493 :                 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
 1664 fujii@postgresql.org     4077         [ +  + ]:           8709 :                 bufHdr->tag.blockNum >= firstDelBlock[j])
                               4078                 :                :             {
 1431 tgl@sss.pgh.pa.us        4079                 :           2421 :                 InvalidateBuffer(bufHdr);   /* releases spinlock */
 1664 fujii@postgresql.org     4080                 :           2421 :                 break;
                               4081                 :                :             }
                               4082                 :                :         }
                               4083         [ +  + ]:           8820 :         if (j >= nforks)
 2926 andres@anarazel.de       4084                 :           6399 :             UnlockBufHdr(bufHdr, buf_state);
                               4085                 :                :     }
                               4086                 :                : }
                               4087                 :                : 
                               4088                 :                : /* ---------------------------------------------------------------------
                               4089                 :                :  *      DropRelationsAllBuffers
                               4090                 :                :  *
                               4091                 :                :  *      This function removes from the buffer pool all the pages of all
                               4092                 :                :  *      forks of the specified relations.  It's equivalent to calling
                               4093                 :                :  *      DropRelationBuffers once per fork per relation with firstDelBlock = 0.
                               4094                 :                :  *      --------------------------------------------------------------------
                               4095                 :                :  */
                               4096                 :                : void
  642 rhaas@postgresql.org     4097                 :          12109 : DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
                               4098                 :                : {
                               4099                 :                :     int         i;
 1187 akapila@postgresql.o     4100                 :          12109 :     int         n = 0;
                               4101                 :                :     SMgrRelation *rels;
                               4102                 :                :     BlockNumber (*block)[MAX_FORKNUM + 1];
 1068 tgl@sss.pgh.pa.us        4103                 :          12109 :     uint64      nBlocksToInvalidate = 0;
                               4104                 :                :     RelFileLocator *locators;
 1187 akapila@postgresql.o     4105                 :          12109 :     bool        cached = true;
                               4106                 :                :     bool        use_bsearch;
                               4107                 :                : 
  648 rhaas@postgresql.org     4108         [ -  + ]:          12109 :     if (nlocators == 0)
 4105 alvherre@alvh.no-ip.     4109                 :UBC           0 :         return;
                               4110                 :                : 
  648 rhaas@postgresql.org     4111                 :CBC       12109 :     rels = palloc(sizeof(SMgrRelation) * nlocators);    /* non-local relations */
                               4112                 :                : 
                               4113                 :                :     /* If it's a local relation, it's localbuf.c's problem. */
                               4114         [ +  + ]:          54590 :     for (i = 0; i < nlocators; i++)
                               4115                 :                :     {
                               4116         [ +  + ]:          42481 :         if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
                               4117                 :                :         {
   42 heikki.linnakangas@i     4118         [ +  - ]:GNC        2903 :             if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
  642 rhaas@postgresql.org     4119                 :CBC        2903 :                 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
                               4120                 :                :         }
                               4121                 :                :         else
 1187 akapila@postgresql.o     4122                 :          39578 :             rels[n++] = smgr_reln[i];
                               4123                 :                :     }
                               4124                 :                : 
                               4125                 :                :     /*
                               4126                 :                :      * If there are no non-local relations, then we're done. Release the
                               4127                 :                :      * memory and return.
                               4128                 :                :      */
 4105 alvherre@alvh.no-ip.     4129         [ +  + ]:          12109 :     if (n == 0)
                               4130                 :                :     {
 1187 akapila@postgresql.o     4131                 :            733 :         pfree(rels);
 4329 tgl@sss.pgh.pa.us        4132                 :            733 :         return;
                               4133                 :                :     }
                               4134                 :                : 
                               4135                 :                :     /*
                               4136                 :                :      * This is used to remember the number of blocks for all the relations
                               4137                 :                :      * forks.
                               4138                 :                :      */
                               4139                 :                :     block = (BlockNumber (*)[MAX_FORKNUM + 1])
 1187 akapila@postgresql.o     4140                 :          11376 :         palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
                               4141                 :                : 
                               4142                 :                :     /*
                               4143                 :                :      * We can avoid scanning the entire buffer pool if we know the exact size
                               4144                 :                :      * of each of the given relation forks. See DropRelationBuffers.
                               4145                 :                :      */
                               4146   [ +  +  +  + ]:          24113 :     for (i = 0; i < n && cached; i++)
                               4147                 :                :     {
  599 drowley@postgresql.o     4148         [ +  + ]:          21393 :         for (int j = 0; j <= MAX_FORKNUM; j++)
                               4149                 :                :         {
                               4150                 :                :             /* Get the number of blocks for a relation's fork. */
 1187 akapila@postgresql.o     4151                 :          19240 :             block[i][j] = smgrnblocks_cached(rels[i], j);
                               4152                 :                : 
                               4153                 :                :             /* We need to consider only the relation forks that exist. */
                               4154         [ +  + ]:          19240 :             if (block[i][j] == InvalidBlockNumber)
                               4155                 :                :             {
                               4156         [ +  + ]:          16907 :                 if (!smgrexists(rels[i], j))
                               4157                 :           6323 :                     continue;
                               4158                 :          10584 :                 cached = false;
                               4159                 :          10584 :                 break;
                               4160                 :                :             }
                               4161                 :                : 
                               4162                 :                :             /* calculate the total number of blocks to be invalidated */
                               4163                 :           2333 :             nBlocksToInvalidate += block[i][j];
                               4164                 :                :         }
                               4165                 :                :     }
                               4166                 :                : 
                               4167                 :                :     /*
                               4168                 :                :      * We apply the optimization iff the total number of blocks to invalidate
                               4169                 :                :      * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
                               4170                 :                :      */
                               4171   [ +  +  +  + ]:          11376 :     if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
                               4172                 :                :     {
                               4173         [ +  + ]:           1268 :         for (i = 0; i < n; i++)
                               4174                 :                :         {
  599 drowley@postgresql.o     4175         [ +  + ]:           3510 :             for (int j = 0; j <= MAX_FORKNUM; j++)
                               4176                 :                :             {
                               4177                 :                :                 /* ignore relation forks that don't exist */
 1187 akapila@postgresql.o     4178         [ +  + ]:           2808 :                 if (!BlockNumberIsValid(block[i][j]))
                               4179                 :           2095 :                     continue;
                               4180                 :                : 
                               4181                 :                :                 /* drop all the buffers for a particular relation fork */
  642 rhaas@postgresql.org     4182                 :            713 :                 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
                               4183                 :            713 :                                            j, block[i][j], 0);
                               4184                 :                :             }
                               4185                 :                :         }
                               4186                 :                : 
 1187 akapila@postgresql.o     4187                 :            566 :         pfree(block);
                               4188                 :            566 :         pfree(rels);
                               4189                 :            566 :         return;
                               4190                 :                :     }
                               4191                 :                : 
                               4192                 :          10810 :     pfree(block);
  648 rhaas@postgresql.org     4193                 :          10810 :     locators = palloc(sizeof(RelFileLocator) * n);  /* non-local relations */
 1187 akapila@postgresql.o     4194         [ +  + ]:          49686 :     for (i = 0; i < n; i++)
  648 rhaas@postgresql.org     4195                 :          38876 :         locators[i] = rels[i]->smgr_rlocator.locator;
                               4196                 :                : 
                               4197                 :                :     /*
                               4198                 :                :      * For a low number of relations to drop just use a simple walk through, to
                               4199                 :                :      * save the bsearch overhead. The threshold to use is rather a guess than
                               4200                 :                :      * an exactly determined value, as it depends on many factors (CPU and RAM
                               4201                 :                :      * speeds, amount of shared buffers etc.).
                               4202                 :                :      */
 1471 noah@leadboat.com        4203                 :          10810 :     use_bsearch = n > RELS_BSEARCH_THRESHOLD;
                               4204                 :                : 
                               4205                 :                :     /* sort the list of rlocators if necessary */
 4105 alvherre@alvh.no-ip.     4206         [ +  + ]:          10810 :     if (use_bsearch)
   58 nathan@postgresql.or     4207                 :GNC         165 :         qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
                               4208                 :                : 
 4329 tgl@sss.pgh.pa.us        4209         [ +  + ]:CBC   118047546 :     for (i = 0; i < NBuffers; i++)
                               4210                 :                :     {
  648 rhaas@postgresql.org     4211                 :      118036736 :         RelFileLocator *rlocator = NULL;
 3072                          4212                 :      118036736 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
                               4213                 :                :         uint32      buf_state;
                               4214                 :                : 
                               4215                 :                :         /*
                               4216                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               4217                 :                :          * saves some cycles.
                               4218                 :                :          */
                               4219                 :                : 
 4105 alvherre@alvh.no-ip.     4220         [ +  + ]:      118036736 :         if (!use_bsearch)
                               4221                 :                :         {
                               4222                 :                :             int         j;
                               4223                 :                : 
                               4224         [ +  + ]:      483404685 :             for (j = 0; j < n; j++)
                               4225                 :                :             {
  599 rhaas@postgresql.org     4226         [ +  + ]:      367176016 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
                               4227                 :                :                 {
  648                          4228                 :          80067 :                     rlocator = &locators[j];
 4105 alvherre@alvh.no-ip.     4229                 :          80067 :                     break;
                               4230                 :                :                 }
                               4231                 :                :             }
                               4232                 :                :         }
                               4233                 :                :         else
                               4234                 :                :         {
                               4235                 :                :             RelFileLocator locator;
                               4236                 :                : 
  599 rhaas@postgresql.org     4237                 :        1728000 :             locator = BufTagGetRelFileLocator(&bufHdr->tag);
                               4238                 :        1728000 :             rlocator = bsearch((const void *) &(locator),
                               4239                 :                :                                locators, n, sizeof(RelFileLocator),
                               4240                 :                :                                rlocator_comparator);
                               4241                 :                :         }
                               4242                 :                : 
                               4243                 :                :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
  648                          4244         [ +  + ]:      118036736 :         if (rlocator == NULL)
 4329 tgl@sss.pgh.pa.us        4245                 :      117955010 :             continue;
                               4246                 :                : 
 2926 andres@anarazel.de       4247                 :          81726 :         buf_state = LockBufHdr(bufHdr);
  599 rhaas@postgresql.org     4248         [ +  - ]:          81726 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
 4329 tgl@sss.pgh.pa.us        4249                 :          81726 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
                               4250                 :                :         else
 2926 andres@anarazel.de       4251                 :UBC           0 :             UnlockBufHdr(bufHdr, buf_state);
                               4252                 :                :     }
                               4253                 :                : 
  648 rhaas@postgresql.org     4254                 :CBC       10810 :     pfree(locators);
 1187 akapila@postgresql.o     4255                 :          10810 :     pfree(rels);
                               4256                 :                : }
                               4257                 :                : 
                               4258                 :                : /* ---------------------------------------------------------------------
                               4259                 :                :  *      FindAndDropRelationBuffers
                               4260                 :                :  *
                               4261                 :                :  *      This function looks up each candidate block in the BufMapping table
                               4262                 :                :  *      and removes from the buffer pool all the pages of the specified
                               4263                 :                :  *      relation fork whose block numbers are >= firstDelBlock.  (In
                               4264                 :                :  *      particular, with firstDelBlock = 0, all pages are removed.)
                                                    :                :  *
                                                    :                :  *      rlocator      - identifies the relation whose buffers are dropped
                                                    :                :  *      forkNum       - fork of that relation to process
                                                    :                :  *      nForkBlock    - exclusive upper bound on the block numbers probed
                                                    :                :  *      firstDelBlock - first block number to drop
                                                    :                :  *
                                                    :                :  *      Blocks are probed one at a time over [firstDelBlock, nForkBlock), so
                                                    :                :  *      the number of lookups depends on that range, not on NBuffers.
                               4265                 :                :  * --------------------------------------------------------------------
                               4266                 :                :  */
                               4267                 :                : static void
  642 rhaas@postgresql.org     4268                 :            792 : FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
                               4269                 :                :                            BlockNumber nForkBlock,
                               4270                 :                :                            BlockNumber firstDelBlock)
                               4271                 :                : {
                               4272                 :                :     BlockNumber curBlock;
                               4273                 :                :     /* Probe every block number in [firstDelBlock, nForkBlock) individually. */
 1188 akapila@postgresql.o     4274         [ +  + ]:           1922 :     for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
                               4275                 :                :     {
                               4276                 :                :         uint32      bufHash;    /* hash value for tag */
                               4277                 :                :         BufferTag   bufTag;     /* identity of requested block */
                               4278                 :                :         LWLock     *bufPartitionLock;   /* buffer partition lock for it */
                               4279                 :                :         int         buf_id;
                               4280                 :                :         BufferDesc *bufHdr;
                               4281                 :                :         uint32      buf_state;
                               4282                 :                : 
                               4283                 :                :         /* create a tag so we can look up the buffer */
  627 rhaas@postgresql.org     4284                 :           1130 :         InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
                               4285                 :                : 
                               4286                 :                :         /* determine its hash code and partition lock ID */
 1188 akapila@postgresql.o     4287                 :           1130 :         bufHash = BufTableHashCode(&bufTag);
                               4288                 :           1130 :         bufPartitionLock = BufMappingPartitionLock(bufHash);
                               4289                 :                : 
                               4290                 :                :         /* Check that it is in the buffer pool. If not, do nothing. */
                               4291                 :           1130 :         LWLockAcquire(bufPartitionLock, LW_SHARED);
                               4292                 :           1130 :         buf_id = BufTableLookup(&bufTag, bufHash);
                               4293                 :           1130 :         LWLockRelease(bufPartitionLock);
                               4294                 :                :         /* A negative buf_id means no buffer currently holds this block. */
                               4295         [ +  + ]:           1130 :         if (buf_id < 0)
                               4296                 :            151 :             continue;
                               4297                 :                : 
                               4298                 :            979 :         bufHdr = GetBufferDescriptor(buf_id);
                               4299                 :                : 
                               4300                 :                :         /*
                               4301                 :                :          * We need to lock the buffer header and recheck if the buffer is
                               4302                 :                :          * still associated with the same block because the buffer could be
                               4303                 :                :          * evicted by some other backend loading blocks for a different
                               4304                 :                :          * relation after we release lock on the BufMapping table.
                               4305                 :                :          */
                               4306                 :            979 :         buf_state = LockBufHdr(bufHdr);
                               4307                 :                : 
  599 rhaas@postgresql.org     4308   [ +  -  +  - ]:           1958 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
                               4309                 :            979 :             BufTagGetForkNum(&bufHdr->tag) == forkNum &&
 1188 akapila@postgresql.o     4310         [ +  - ]:            979 :             bufHdr->tag.blockNum >= firstDelBlock)
                               4311                 :            979 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
                               4312                 :                :         else
 1188 akapila@postgresql.o     4313                 :UBC           0 :             UnlockBufHdr(bufHdr, buf_state);    /* recheck failed; leave the buffer alone */
                               4314                 :                :     }
 1188 akapila@postgresql.o     4315                 :CBC         792 : }
                               4316                 :                : 
                               4317                 :                : /* ---------------------------------------------------------------------
                               4318                 :                :  *      DropDatabaseBuffers
                               4319                 :                :  *
                               4320                 :                :  *      This function removes all the buffers in the buffer cache for a
                               4321                 :                :  *      particular database.  Dirty pages are simply dropped, without
                               4322                 :                :  *      bothering to write them out first.  This is used when we destroy a
                               4323                 :                :  *      database, to avoid trying to flush data to disk when the directory
                               4324                 :                :  *      tree no longer exists.  Implementation is pretty similar to
                               4325                 :                :  *      DropRelationBuffers() which is for destroying just one relation.
                                                    :                :  *
                                                    :                :  *      dbid - OID of the database; every buffer whose tag carries this
                                                    :                :  *      dbOid is invalidated.  All NBuffers descriptors are scanned.
                               4326                 :                :  * --------------------------------------------------------------------
                               4327                 :                :  */
                               4328                 :                : void
 6591 tgl@sss.pgh.pa.us        4329                 :             64 : DropDatabaseBuffers(Oid dbid)
                               4330                 :                : {
                               4331                 :                :     int         i;
                               4332                 :                : 
                               4333                 :                :     /*
                               4334                 :                :      * We needn't consider local buffers, since by assumption the target
                               4335                 :                :      * database isn't our own.
                               4336                 :                :      */
                               4337                 :                : 
 6981                          4338         [ +  + ]:         203328 :     for (i = 0; i < NBuffers; i++)
                               4339                 :                :     {
 3072 rhaas@postgresql.org     4340                 :         203264 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
                               4341                 :                :         uint32      buf_state;
                               4342                 :                : 
                               4343                 :                :         /*
                               4344                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               4345                 :                :          * saves some cycles.
                               4346                 :                :          */
  599                          4347         [ +  + ]:         203264 :         if (bufHdr->tag.dbOid != dbid)
 4329 tgl@sss.pgh.pa.us        4348                 :         194782 :             continue;
                               4349                 :                :         /* Repeat the test with the buffer header locked before invalidating. */
 2926 andres@anarazel.de       4350                 :           8482 :         buf_state = LockBufHdr(bufHdr);
  599 rhaas@postgresql.org     4351         [ +  - ]:           8482 :         if (bufHdr->tag.dbOid == dbid)
 6756 bruce@momjian.us         4352                 :           8482 :             InvalidateBuffer(bufHdr);   /* releases spinlock */
                               4353                 :                :         else
 2926 andres@anarazel.de       4354                 :UBC           0 :             UnlockBufHdr(bufHdr, buf_state);    /* tag no longer matches; leave the buffer alone */
                               4355                 :                :     }
10141 scrappy@hub.org          4356                 :CBC          64 : }
                               4357                 :                : 
                               4358                 :                : /* -----------------------------------------------------------------
                               4359                 :                :  *      PrintBufferDescs
                               4360                 :                :  *
                               4361                 :                :  *      This function logs every buffer descriptor (one elog(LOG) line
                               4362                 :                :  *      per descriptor), for debugging use only.
                                                    :                :  *
                                                    :                :  *      Compiled only when NOT_USED is defined.  The descriptor fields
                                                    :                :  *      are read without taking the buffer header lock.
                                                    :                :  *
                                                    :                :  *      NOTE(review): this reads buf->flags and buf->refcount directly,
                                                    :                :  *      while the live code in this file takes flag bits from the value
                                                    :                :  *      of bufHdr->state (see FlushRelationBuffers).  Confirm that these
                                                    :                :  *      fields still exist before defining NOT_USED.
                               4363                 :                :  * -----------------------------------------------------------------
                               4364                 :                :  */
                               4365                 :                : #ifdef NOT_USED
                               4366                 :                : void
                               4367                 :                : PrintBufferDescs(void)
                               4368                 :                : {
                               4369                 :                :     int         i;
                               4370                 :                :     /* Visit descriptors 0 .. NBuffers-1 in index order. */
                               4371                 :                :     for (i = 0; i < NBuffers; ++i)
                               4372                 :                :     {
                               4373                 :                :         BufferDesc *buf = GetBufferDescriptor(i);
                               4374                 :                :         Buffer      b = BufferDescriptorGetBuffer(buf);
                               4375                 :                : 
                               4376                 :                :         /* theoretically we should lock the bufhdr here */
                               4377                 :                :         elog(LOG,
                               4378                 :                :              "[%02d] (freeNext=%d, rel=%s, "
                               4379                 :                :              "blockNum=%u, flags=0x%x, refcount=%u %d)",
                               4380                 :                :              i, buf->freeNext,
                               4381                 :                :              relpathbackend(BufTagGetRelFileLocator(&buf->tag),
                               4382                 :                :                             INVALID_PROC_NUMBER, BufTagGetForkNum(&buf->tag)),
                               4383                 :                :              buf->tag.blockNum, buf->flags,
                               4384                 :                :              buf->refcount, GetPrivateRefCount(b));
                               4385                 :                :     }
                               4386                 :                : }
                               4387                 :                : #endif
                               4388                 :                : /* -----------------------------------------------------------------
                                                    :                :  *      PrintPinnedBufs
                                                    :                :  *
                                                    :                :  *      Like PrintBufferDescs, but logs only those descriptors for which
                                                    :                :  *      GetPrivateRefCount() returns a value greater than zero, and
                                                    :                :  *      formats the relation path with relpathperm().  Debugging use
                                                    :                :  *      only; compiled only when NOT_USED is defined.
                                                    :                :  *
                                                    :                :  *      NOTE(review): reads buf->flags and buf->refcount directly, while
                                                    :                :  *      the live code in this file takes flag bits from bufHdr->state.
                                                    :                :  *      Confirm that these fields still exist before defining NOT_USED.
                                                    :                :  * -----------------------------------------------------------------
                                                    :                :  */
                               4389                 :                : #ifdef NOT_USED
                               4390                 :                : void
                               4391                 :                : PrintPinnedBufs(void)
                               4392                 :                : {
                               4393                 :                :     int         i;
                               4394                 :                : 
                               4395                 :                :     for (i = 0; i < NBuffers; ++i)
                               4396                 :                :     {
                               4397                 :                :         BufferDesc *buf = GetBufferDescriptor(i);
                               4398                 :                :         Buffer      b = BufferDescriptorGetBuffer(buf);
                               4399                 :                :         /* Only report buffers with a positive private reference count. */
                               4400                 :                :         if (GetPrivateRefCount(b) > 0)
                               4401                 :                :         {
                               4402                 :                :             /* theoretically we should lock the bufhdr here */
                               4403                 :                :             elog(LOG,
                               4404                 :                :                  "[%02d] (freeNext=%d, rel=%s, "
                               4405                 :                :                  "blockNum=%u, flags=0x%x, refcount=%u %d)",
                               4406                 :                :                  i, buf->freeNext,
                               4407                 :                :                  relpathperm(BufTagGetRelFileLocator(&buf->tag),
                               4408                 :                :                              BufTagGetForkNum(&buf->tag)),
                               4409                 :                :                  buf->tag.blockNum, buf->flags,
                               4410                 :                :                  buf->refcount, GetPrivateRefCount(b));
                               4411                 :                :         }
                               4412                 :                :     }
                               4413                 :                : }
                               4414                 :                : #endif
                               4415                 :                : 
                               4416                 :                : /* ---------------------------------------------------------------------
                               4417                 :                :  *      FlushRelationBuffers
                               4418                 :                :  *
                               4419                 :                :  *      This function writes all dirty pages of a relation out to disk
                               4420                 :                :  *      (or more accurately, out to kernel disk buffers), ensuring that the
                               4421                 :                :  *      kernel has an up-to-date view of the relation.
                               4422                 :                :  *
                               4423                 :                :  *      Generally, the caller should be holding AccessExclusiveLock on the
                               4424                 :                :  *      target relation to ensure that no other backend is busy dirtying
                               4425                 :                :  *      more blocks of the relation; the effects can't be expected to last
                               4426                 :                :  *      after the lock is released.
                               4427                 :                :  *
                               4428                 :                :  *      XXX currently it sequentially searches the buffer pool, should be
                               4429                 :                :  *      changed to more clever ways of searching.  This routine is not
                               4430                 :                :  *      used in any performance-critical code paths, so it's not worth
                               4431                 :                :  *      adding additional overhead to normal paths to make it go faster.
                                                    :                :  *
                                                    :                :  *      rel - relation whose dirty buffers are to be written.  If it uses
                                                    :                :  *      local buffers, only the NLocBuffer local descriptors are scanned
                                                    :                :  *      and matching pages are written with smgrwrite(); otherwise all
                                                    :                :  *      NBuffers descriptors are scanned and matching pages are written
                                                    :                :  *      with FlushBuffer() while holding the content lock in shared mode.
                               4432                 :                :  * --------------------------------------------------------------------
                               4433                 :                :  */
                               4434                 :                : void
 6965 tgl@sss.pgh.pa.us        4435                 :            130 : FlushRelationBuffers(Relation rel)
                               4436                 :                : {
                               4437                 :                :     int         i;
                               4438                 :                :     BufferDesc *bufHdr;
   74 heikki.linnakangas@i     4439                 :GNC         130 :     SMgrRelation srel = RelationGetSmgr(rel);
                               4440                 :                :     /* Relations that use local buffers are handled entirely by this branch. */
 4871 rhaas@postgresql.org     4441         [ +  + ]:CBC         130 :     if (RelationUsesLocalBuffers(rel))
                               4442                 :                :     {
 9701 vadim4o@yahoo.com        4443         [ +  + ]:            909 :         for (i = 0; i < NLocBuffer; i++)
                               4444                 :                :         {
                               4445                 :                :             uint32      buf_state;
                               4446                 :                :             instr_time  io_start;
                               4447                 :                : 
 3363 andres@anarazel.de       4448                 :            900 :             bufHdr = GetLocalBufferDescriptor(i);
  599 rhaas@postgresql.org     4449         [ +  + ]:            900 :             if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
 2926 andres@anarazel.de       4450         [ +  + ]:            300 :                 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
                               4451                 :                :                  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
                               4452                 :                :             {
                               4453                 :                :                 ErrorContextCallback errcallback;
                               4454                 :                :                 Page        localpage;
                               4455                 :                : 
 4041 simon@2ndQuadrant.co     4456                 :            297 :                 localpage = (char *) LocalBufHdrGetBlock(bufHdr);
                               4457                 :                : 
                               4458                 :                :                 /* Setup error traceback support for ereport() */
 4171 heikki.linnakangas@i     4459                 :            297 :                 errcallback.callback = local_buffer_write_error_callback;
                               4460                 :            297 :                 errcallback.arg = (void *) bufHdr;
                               4461                 :            297 :                 errcallback.previous = error_context_stack;
                               4462                 :            297 :                 error_context_stack = &errcallback;
                               4463                 :                :                 /* Set the page checksum in place before the page is written. */
 4041 simon@2ndQuadrant.co     4464                 :            297 :                 PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
                               4465                 :                :                 /* Capture the I/O start time, subject to track_io_timing. */
  120 michael@paquier.xyz      4466                 :GNC         297 :                 io_start = pgstat_prepare_io_time(track_io_timing);
                               4467                 :                : 
   74 heikki.linnakangas@i     4468                 :            297 :                 smgrwrite(srel,
  599 rhaas@postgresql.org     4469                 :CBC         297 :                           BufTagGetForkNum(&bufHdr->tag),
                               4470                 :                :                           bufHdr->tag.blockNum,
                               4471                 :                :                           localpage,
                               4472                 :                :                           false);
                               4473                 :                :                 /* Account the write as one temp-relation write operation. */
  373 andres@anarazel.de       4474                 :            297 :                 pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
                               4475                 :                :                                         IOCONTEXT_NORMAL, IOOP_WRITE,
                               4476                 :                :                                         io_start, 1);
                               4477                 :                :                 /* Clear the dirty bits in our copy of the state and store it back. */
 2926                          4478                 :            297 :                 buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
 2746                          4479                 :            297 :                 pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
                               4480                 :                : 
  373                          4481                 :            297 :                 pgBufferUsage.local_blks_written++;
                               4482                 :                : 
                               4483                 :                :                 /* Pop the error context stack */
 4171 heikki.linnakangas@i     4484                 :            297 :                 error_context_stack = errcallback.previous;
                               4485                 :                :             }
                               4486                 :                :         }
                               4487                 :                :         /* Local-buffer case is complete; the NBuffers scan below is skipped. */
 7258 tgl@sss.pgh.pa.us        4488                 :              9 :         return;
                               4489                 :                :     }
                               4490                 :                :     /* Not a local-buffer relation: scan all NBuffers descriptors. */
 9701 vadim4o@yahoo.com        4491         [ +  + ]:        1429881 :     for (i = 0; i < NBuffers; i++)
                               4492                 :                :     {
                               4493                 :                :         uint32      buf_state;
                               4494                 :                : 
 3363 andres@anarazel.de       4495                 :        1429760 :         bufHdr = GetBufferDescriptor(i);
                               4496                 :                : 
                               4497                 :                :         /*
                               4498                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               4499                 :                :          * saves some cycles.
                               4500                 :                :          */
  599 rhaas@postgresql.org     4501         [ +  + ]:        1429760 :         if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
 4329 tgl@sss.pgh.pa.us        4502                 :        1429555 :             continue;
                               4503                 :                : 
                               4504                 :                :         /* Make sure we can handle the pin */
 3373 andres@anarazel.de       4505                 :            205 :         ReservePrivateRefCountEntry();
  158 heikki.linnakangas@i     4506                 :GNC         205 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               4507                 :                :         /* Recheck the tag, and test valid+dirty, with the buffer header locked. */
 2926 andres@anarazel.de       4508                 :CBC         205 :         buf_state = LockBufHdr(bufHdr);
  599 rhaas@postgresql.org     4509         [ +  - ]:            205 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
 2926 andres@anarazel.de       4510         [ +  + ]:            205 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
                               4511                 :                :         {
 6965 tgl@sss.pgh.pa.us        4512                 :            163 :             PinBuffer_Locked(bufHdr);   /* NOTE(review): no UnlockBufHdr on this path; presumably this call releases the header lock -- confirm */
 3043 rhaas@postgresql.org     4513                 :            163 :             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
   74 heikki.linnakangas@i     4514                 :GNC         163 :             FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 3043 rhaas@postgresql.org     4515                 :CBC         163 :             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
  562 michael@paquier.xyz      4516                 :            163 :             UnpinBuffer(bufHdr);
                               4517                 :                :         }
                               4518                 :                :         else
 2926 andres@anarazel.de       4519                 :             42 :             UnlockBufHdr(bufHdr, buf_state);    /* tag changed, or not both valid and dirty: nothing to flush */
                               4520                 :                :     }
                               4521                 :                : }
                               4522                 :                : 
                               4523                 :                : /* ---------------------------------------------------------------------
                               4524                 :                :  *      FlushRelationsAllBuffers
                               4525                 :                :  *
                               4526                 :                :  *      This function flushes out of the buffer pool all the pages of all
                               4527                 :                :  *      forks of the specified smgr relations.  It's equivalent to calling
                               4528                 :                :  *      FlushRelationBuffers once per relation.  The relations are assumed not
                               4529                 :                :  *      to use local buffers.
                               4530                 :                :  * --------------------------------------------------------------------
                               4531                 :                :  */
                               4532                 :                : void
 1471 noah@leadboat.com        4533                 :              9 : FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
                               4534                 :                : {
                               4535                 :                :     int         i;
                               4536                 :                :     SMgrSortArray *srels;
                               4537                 :                :     bool        use_bsearch;
                               4538                 :                : 
                               4539         [ -  + ]:              9 :     if (nrels == 0)
 1471 noah@leadboat.com        4540                 :UBC           0 :         return;
                               4541                 :                : 
                               4542                 :                :     /* fill-in array for qsort */
 1471 noah@leadboat.com        4543                 :CBC           9 :     srels = palloc(sizeof(SMgrSortArray) * nrels);
                               4544                 :                : 
                               4545         [ +  + ]:             18 :     for (i = 0; i < nrels; i++)
                               4546                 :                :     {
  648 rhaas@postgresql.org     4547         [ -  + ]:              9 :         Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
                               4548                 :                : 
                               4549                 :              9 :         srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
 1471 noah@leadboat.com        4550                 :              9 :         srels[i].srel = smgrs[i];
                               4551                 :                :     }
                               4552                 :                : 
                               4553                 :                :     /*
                               4554                 :                :      * Save the bsearch overhead for low number of relations to sync. See
                               4555                 :                :      * DropRelationsAllBuffers for details.
                               4556                 :                :      */
                               4557                 :              9 :     use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
                               4558                 :                : 
                               4559                 :                :     /* sort the list of SMgrRelations if necessary */
                               4560         [ -  + ]:              9 :     if (use_bsearch)
   58 nathan@postgresql.or     4561                 :UNC           0 :         qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
                               4562                 :                : 
 1471 noah@leadboat.com        4563         [ +  + ]:CBC      147465 :     for (i = 0; i < NBuffers; i++)
                               4564                 :                :     {
                               4565                 :         147456 :         SMgrSortArray *srelent = NULL;
                               4566                 :         147456 :         BufferDesc *bufHdr = GetBufferDescriptor(i);
                               4567                 :                :         uint32      buf_state;
                               4568                 :                : 
                               4569                 :                :         /*
                               4570                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               4571                 :                :          * saves some cycles.
                               4572                 :                :          */
                               4573                 :                : 
                               4574         [ +  - ]:         147456 :         if (!use_bsearch)
                               4575                 :                :         {
                               4576                 :                :             int         j;
                               4577                 :                : 
                               4578         [ +  + ]:         291094 :             for (j = 0; j < nrels; j++)
                               4579                 :                :             {
  599 rhaas@postgresql.org     4580         [ +  + ]:         147456 :                 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
                               4581                 :                :                 {
 1471 noah@leadboat.com        4582                 :           3818 :                     srelent = &srels[j];
                               4583                 :           3818 :                     break;
                               4584                 :                :                 }
                               4585                 :                :             }
                               4586                 :                :         }
                               4587                 :                :         else
                               4588                 :                :         {
                               4589                 :                :             RelFileLocator rlocator;
                               4590                 :                : 
  599 rhaas@postgresql.org     4591                 :UBC           0 :             rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
                               4592                 :              0 :             srelent = bsearch((const void *) &(rlocator),
                               4593                 :                :                               srels, nrels, sizeof(SMgrSortArray),
                               4594                 :                :                               rlocator_comparator);
                               4595                 :                :         }
                               4596                 :                : 
                               4597                 :                :         /* buffer doesn't belong to any of the given relfilelocators; skip it */
 1471 noah@leadboat.com        4598         [ +  + ]:CBC      147456 :         if (srelent == NULL)
                               4599                 :         143638 :             continue;
                               4600                 :                : 
                               4601                 :                :         /* Make sure we can handle the pin */
                               4602                 :           3818 :         ReservePrivateRefCountEntry();
  158 heikki.linnakangas@i     4603                 :GNC        3818 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               4604                 :                : 
 1471 noah@leadboat.com        4605                 :CBC        3818 :         buf_state = LockBufHdr(bufHdr);
  599 rhaas@postgresql.org     4606         [ +  - ]:           3818 :         if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
 1471 noah@leadboat.com        4607         [ +  + ]:           3818 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
                               4608                 :                :         {
                               4609                 :           3367 :             PinBuffer_Locked(bufHdr);
                               4610                 :           3367 :             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
  430 andres@anarazel.de       4611                 :           3367 :             FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 1471 noah@leadboat.com        4612                 :           3367 :             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
  562 michael@paquier.xyz      4613                 :           3367 :             UnpinBuffer(bufHdr);
                               4614                 :                :         }
                               4615                 :                :         else
 1471 noah@leadboat.com        4616                 :            451 :             UnlockBufHdr(bufHdr, buf_state);
                               4617                 :                :     }
                               4618                 :                : 
                               4619                 :              9 :     pfree(srels);
                               4620                 :                : }
                               4621                 :                : 
                               4622                 :                : /* ---------------------------------------------------------------------
                               4623                 :                :  *      RelationCopyStorageUsingBuffer
                               4624                 :                :  *
                               4625                 :                :  *      Copy fork's data using bufmgr.  Same as RelationCopyStorage but instead
                               4626                 :                :  *      of using smgrread and smgrextend this will copy using bufmgr APIs.
                               4627                 :                :  *
                               4628                 :                :  *      Refer comments atop CreateAndCopyRelationData() for details about
                               4629                 :                :  *      'permanent' parameter.
                               4630                 :                :  * --------------------------------------------------------------------
                               4631                 :                :  */
                               4632                 :                : static void
  611 rhaas@postgresql.org     4633                 :          64249 : RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
                               4634                 :                :                                RelFileLocator dstlocator,
                               4635                 :                :                                ForkNumber forkNum, bool permanent)
                               4636                 :                : {
                               4637                 :                :     Buffer      srcBuf;
                               4638                 :                :     Buffer      dstBuf;
                               4639                 :                :     Page        srcPage;
                               4640                 :                :     Page        dstPage;
                               4641                 :                :     bool        use_wal;
                               4642                 :                :     BlockNumber nblocks;
                               4643                 :                :     BlockNumber blkno;
                               4644                 :                :     PGIOAlignedBlock buf;
                               4645                 :                :     BufferAccessStrategy bstrategy_src;
                               4646                 :                :     BufferAccessStrategy bstrategy_dst;
                               4647                 :                : 
                               4648                 :                :     /*
                               4649                 :                :      * In general, we want to write WAL whenever wal_level > 'minimal', but we
                               4650                 :                :      * can skip it when copying any fork of an unlogged relation other than
                               4651                 :                :      * the init fork.
                               4652                 :                :      */
  747                          4653   [ +  +  -  +  :          64249 :     use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
                                              -  - ]
                               4654                 :                : 
                               4655                 :                :     /* Get number of blocks in the source relation. */
   42 heikki.linnakangas@i     4656                 :GNC       64249 :     nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
                               4657                 :                :                           forkNum);
                               4658                 :                : 
                               4659                 :                :     /* Nothing to copy; just return. */
  747 rhaas@postgresql.org     4660         [ +  + ]:CBC       64249 :     if (nblocks == 0)
                               4661                 :          11066 :         return;
                               4662                 :                : 
                               4663                 :                :     /*
                               4664                 :                :      * Bulk extend the destination relation of the same size as the source
                               4665                 :                :      * relation before starting to copy block by block.
                               4666                 :                :      */
  605                          4667                 :          53183 :     memset(buf.data, 0, BLCKSZ);
   42 heikki.linnakangas@i     4668                 :GNC       53183 :     smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
                               4669                 :                :                buf.data, true);
                               4670                 :                : 
                               4671                 :                :     /* This is a bulk operation, so use buffer access strategies. */
  747 rhaas@postgresql.org     4672                 :CBC       53183 :     bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
                               4673                 :          53183 :     bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
                               4674                 :                : 
                               4675                 :                :     /* Iterate over each block of the source relation file. */
                               4676         [ +  + ]:         256198 :     for (blkno = 0; blkno < nblocks; blkno++)
                               4677                 :                :     {
                               4678         [ -  + ]:         203015 :         CHECK_FOR_INTERRUPTS();
                               4679                 :                : 
                               4680                 :                :         /* Read block from source relation. */
  611                          4681                 :         203015 :         srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
                               4682                 :                :                                            RBM_NORMAL, bstrategy_src,
                               4683                 :                :                                            permanent);
  617 tgl@sss.pgh.pa.us        4684                 :         203015 :         LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
  747 rhaas@postgresql.org     4685                 :         203015 :         srcPage = BufferGetPage(srcBuf);
                               4686                 :                : 
  605                          4687                 :         203015 :         dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
                               4688                 :                :                                            RBM_ZERO_AND_LOCK, bstrategy_dst,
                               4689                 :                :                                            permanent);
  617 tgl@sss.pgh.pa.us        4690                 :         203015 :         dstPage = BufferGetPage(dstBuf);
                               4691                 :                : 
  747 rhaas@postgresql.org     4692                 :         203015 :         START_CRIT_SECTION();
                               4693                 :                : 
                               4694                 :                :         /* Copy page data from the source to the destination. */
                               4695                 :         203015 :         memcpy(dstPage, srcPage, BLCKSZ);
                               4696                 :         203015 :         MarkBufferDirty(dstBuf);
                               4697                 :                : 
                               4698                 :                :         /* WAL-log the copied page. */
                               4699         [ +  + ]:         203015 :         if (use_wal)
                               4700                 :         114168 :             log_newpage_buffer(dstBuf, true);
                               4701                 :                : 
                               4702         [ -  + ]:         203015 :         END_CRIT_SECTION();
                               4703                 :                : 
                               4704                 :         203015 :         UnlockReleaseBuffer(dstBuf);
  617 tgl@sss.pgh.pa.us        4705                 :         203015 :         UnlockReleaseBuffer(srcBuf);
                               4706                 :                :     }
                               4707                 :                : 
  391 andres@anarazel.de       4708                 :          53183 :     FreeAccessStrategy(bstrategy_src);
                               4709                 :          53183 :     FreeAccessStrategy(bstrategy_dst);
                               4710                 :                : }
                               4711                 :                : 
                               4712                 :                : /* ---------------------------------------------------------------------
                               4713                 :                :  *      CreateAndCopyRelationData
                               4714                 :                :  *
                               4715                 :                :  *      Create destination relation storage and copy all forks from the
                               4716                 :                :  *      source relation to the destination.
                               4717                 :                :  *
                               4718                 :                :  *      Pass permanent as true for permanent relations and false for
                               4719                 :                :  *      unlogged relations.  Currently this API is not supported for
                               4720                 :                :  *      temporary relations.
                               4721                 :                :  * --------------------------------------------------------------------
                               4722                 :                :  */
                               4723                 :                : void
  648 rhaas@postgresql.org     4724                 :          48189 : CreateAndCopyRelationData(RelFileLocator src_rlocator,
                               4725                 :                :                           RelFileLocator dst_rlocator, bool permanent)
                               4726                 :                : {
                               4727                 :                :     char        relpersistence;
                               4728                 :                :     SMgrRelation src_rel;
                               4729                 :                :     SMgrRelation dst_rel;
                               4730                 :                : 
                               4731                 :                :     /* Set the relpersistence. */
  747                          4732         [ +  - ]:          48189 :     relpersistence = permanent ?
                               4733                 :                :         RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
                               4734                 :                : 
   42 heikki.linnakangas@i     4735                 :GNC       48189 :     src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
                               4736                 :          48189 :     dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
                               4737                 :                : 
                               4738                 :                :     /*
                               4739                 :                :      * Create and copy all forks of the relation.  During create database we
                               4740                 :                :      * have a separate cleanup mechanism which deletes complete database
                               4741                 :                :      * directory.  Therefore, each individual relation doesn't need to be
                               4742                 :                :      * registered for cleanup.
                               4743                 :                :      */
  648 rhaas@postgresql.org     4744                 :CBC       48189 :     RelationCreateStorage(dst_rlocator, relpersistence, false);
                               4745                 :                : 
                               4746                 :                :     /* copy main fork. */
  611                          4747                 :          48189 :     RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
                               4748                 :                :                                    permanent);
                               4749                 :                : 
                               4750                 :                :     /* copy those extra forks that exist */
  747                          4751                 :          48189 :     for (ForkNumber forkNum = MAIN_FORKNUM + 1;
                               4752         [ +  + ]:         192756 :          forkNum <= MAX_FORKNUM; forkNum++)
                               4753                 :                :     {
   74 heikki.linnakangas@i     4754         [ +  + ]:GNC      144567 :         if (smgrexists(src_rel, forkNum))
                               4755                 :                :         {
                               4756                 :          16060 :             smgrcreate(dst_rel, forkNum, false);
                               4757                 :                : 
                               4758                 :                :             /*
                               4759                 :                :              * WAL log creation if the relation is persistent, or this is the
                               4760                 :                :              * init fork of an unlogged relation.
                               4761                 :                :              */
  747 rhaas@postgresql.org     4762   [ -  +  -  - ]:CBC       16060 :             if (permanent || forkNum == INIT_FORKNUM)
  648                          4763                 :          16060 :                 log_smgrcreate(&dst_rlocator, forkNum);
                               4764                 :                : 
                               4765                 :                :             /* Copy a fork's data, block by block. */
  611                          4766                 :          16060 :             RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
                               4767                 :                :                                            permanent);
                               4768                 :                :         }
                               4769                 :                :     }
  747                          4770                 :          48189 : }
                               4771                 :                : 
                               4772                 :                : /* ---------------------------------------------------------------------
                               4773                 :                :  *      FlushDatabaseBuffers
                               4774                 :                :  *
                               4775                 :                :  *      This function writes all dirty pages of a database out to disk
                               4776                 :                :  *      (or more accurately, out to kernel disk buffers), ensuring that the
                               4777                 :                :  *      kernel has an up-to-date view of the database.
                               4778                 :                :  *
                               4779                 :                :  *      Generally, the caller should be holding an appropriate lock to ensure
                               4780                 :                :  *      no other backend is active in the target database; otherwise more
                               4781                 :                :  *      pages could get dirtied.
                               4782                 :                :  *
                               4783                 :                :  *      Note we don't worry about flushing any pages of temporary relations.
                               4784                 :                :  *      It's assumed these wouldn't be interesting.
                               4785                 :                :  * --------------------------------------------------------------------
                               4786                 :                :  */
                               4787                 :                : void
 6135 tgl@sss.pgh.pa.us        4788                 :             16 : FlushDatabaseBuffers(Oid dbid)
                               4789                 :                : {
                               4790                 :                :     int         i;
                               4791                 :                :     BufferDesc *bufHdr;
                               4792                 :                : 
                               4793         [ +  + ]:           2064 :     for (i = 0; i < NBuffers; i++)
                               4794                 :                :     {
                               4795                 :                :         uint32      buf_state;
                               4796                 :                : 
 3363 andres@anarazel.de       4797                 :           2048 :         bufHdr = GetBufferDescriptor(i);
                               4798                 :                : 
                               4799                 :                :         /*
                               4800                 :                :          * As in DropRelationBuffers, an unlocked precheck should be safe and
                               4801                 :                :          * saves some cycles.
                               4802                 :                :          */
  599 rhaas@postgresql.org     4803         [ +  + ]:           2048 :         if (bufHdr->tag.dbOid != dbid)
 4329 tgl@sss.pgh.pa.us        4804                 :           1668 :             continue;
                               4805                 :                : 
                               4806                 :                :         /* Make sure we can handle the pin */
 3373 andres@anarazel.de       4807                 :            380 :         ReservePrivateRefCountEntry();
  158 heikki.linnakangas@i     4808                 :GNC         380 :         ResourceOwnerEnlarge(CurrentResourceOwner);
                               4809                 :                : 
 2926 andres@anarazel.de       4810                 :CBC         380 :         buf_state = LockBufHdr(bufHdr);
  599 rhaas@postgresql.org     4811         [ +  - ]:            380 :         if (bufHdr->tag.dbOid == dbid &&
 2926 andres@anarazel.de       4812         [ +  + ]:            380 :             (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
                               4813                 :                :         {
 6135 tgl@sss.pgh.pa.us        4814                 :            288 :             PinBuffer_Locked(bufHdr);
 3043 rhaas@postgresql.org     4815                 :            288 :             LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
  430 andres@anarazel.de       4816                 :            288 :             FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 3043 rhaas@postgresql.org     4817                 :            288 :             LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
  562 michael@paquier.xyz      4818                 :            288 :             UnpinBuffer(bufHdr);
                               4819                 :                :         }
                               4820                 :                :         else
 2926 andres@anarazel.de       4821                 :             92 :             UnlockBufHdr(bufHdr, buf_state);
                               4822                 :                :     }
 6135 tgl@sss.pgh.pa.us        4823                 :             16 : }
                               4824                 :                : 
                               4825                 :                : /*
                               4826                 :                :  * Flush a buffer that the caller has already pinned and content-locked
                               4827                 :                :  * (in shared or exclusive mode) to the OS.
                               4828                 :                :  */
                               4829                 :                : void
 3048 andres@anarazel.de       4830                 :             29 : FlushOneBuffer(Buffer buffer)
                               4831                 :                : {
                               4832                 :                :     BufferDesc *bufHdr;
                               4833                 :                : 
                               4834                 :                :     /* local buffers aren't needed here yet, though nothing prevents support */
                               4835         [ -  + ]:             29 :     Assert(!BufferIsLocal(buffer));
                               4836                 :                : 
                               4837   [ -  +  -  +  :             29 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
                               4838                 :                : 
                               4839                 :             29 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               4840                 :                : 
 3043 rhaas@postgresql.org     4841         [ -  + ]:             29 :     Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
                               4842                 :                : 
  430 andres@anarazel.de       4843                 :             29 :     FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
 3048                          4844                 :             29 : }
                               4845                 :                : 
                               4846                 :                : /*
                               4847                 :                :  * ReleaseBuffer -- release the pin on a (shared or local) buffer
                               4848                 :                :  */
                               4849                 :                : void
 7922 tgl@sss.pgh.pa.us        4850                 :       52419906 : ReleaseBuffer(Buffer buffer)
                               4851                 :                : {
 7121                          4852         [ -  + ]:       52419906 :     if (!BufferIsValid(buffer))
 4683 peter_e@gmx.net          4853         [ #  # ]:UBC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
                               4854                 :                : 
 9716 bruce@momjian.us         4855         [ +  + ]:CBC    52419906 :     if (BufferIsLocal(buffer))
  375 andres@anarazel.de       4856                 :        1510442 :         UnpinLocalBuffer(buffer);
                               4857                 :                :     else
                               4858                 :       50909464 :         UnpinBuffer(GetBufferDescriptor(buffer - 1));
10141 scrappy@hub.org          4859                 :       52419906 : }
                               4860                 :                : 
                               4861                 :                : /*
                               4862                 :                :  * UnlockReleaseBuffer -- release the content lock and pin on a buffer
                               4863                 :                :  *
                               4864                 :                :  * Shorthand: LockBuffer(buffer, BUFFER_LOCK_UNLOCK), then ReleaseBuffer().
                               4865                 :                :  */
                               4866                 :                : void
 6589 tgl@sss.pgh.pa.us        4867                 :       15729463 : UnlockReleaseBuffer(Buffer buffer)
                               4868                 :                : {
                               4869                 :       15729463 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                               4870                 :       15729463 :     ReleaseBuffer(buffer);
                               4871                 :       15729463 : }
                               4872                 :                : 
                               4873                 :                : /*
                               4874                 :                :  * IncrBufferRefCount
                               4875                 :                :  *      Increment the pin count on a buffer that we have *already* pinned
                               4876                 :                :  *      at least once.
                               4877                 :                :  *
                               4878                 :                :  *      This must not be used on a buffer we do not have pinned: it only
                               4879                 :                :  *      bumps LocalRefCount or the private refcount, not shared buffer state.
                               4880                 :                :  */
                               4881                 :                : void
 7211                          4882                 :        9558731 : IncrBufferRefCount(Buffer buffer)
                               4883                 :                : {
 7081 neilc@samurai.com        4884   [ -  +  +  +  :        9558731 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
  158 heikki.linnakangas@i     4885                 :GNC     9558731 :     ResourceOwnerEnlarge(CurrentResourceOwner);
 7211 tgl@sss.pgh.pa.us        4886         [ +  + ]:CBC     9558731 :     if (BufferIsLocal(buffer))
                               4887                 :         347096 :         LocalRefCount[-buffer - 1]++;
                               4888                 :                :     else
                               4889                 :                :     {
                               4890                 :                :         PrivateRefCountEntry *ref;
                               4891                 :                : 
 3373 andres@anarazel.de       4892                 :        9211635 :         ref = GetPrivateRefCountEntry(buffer, true);
 3515                          4893         [ -  + ]:        9211635 :         Assert(ref != NULL);
                               4894                 :        9211635 :         ref->refcount++;
                               4895                 :                :     }
 2349 tgl@sss.pgh.pa.us        4896                 :        9558731 :     ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
 7211                          4897                 :        9558731 : }
                               4898                 :                : 
                               4899                 :                : /*
                               4900                 :                :  * MarkBufferDirtyHint
                               4901                 :                :  *
                               4902                 :                :  *  Mark a buffer dirty for non-critical changes.
                               4903                 :                :  *
                               4904                 :                :  * This is essentially the same as MarkBufferDirty, except:
                               4905                 :                :  *
                               4906                 :                :  * 1. The caller does not write WAL; so if checksums are enabled, we may need
                               4907                 :                :  *    to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
                               4908                 :                :  * 2. The caller might have only share-lock instead of exclusive-lock on the
                               4909                 :                :  *    buffer's content lock.
                               4910                 :                :  * 3. This function does not guarantee that the buffer is always marked dirty
                               4911                 :                :  *    (due to a race condition), so it cannot be used for important changes.
                               4912                 :                :  */
                               4913                 :                : void
 3954 jdavis@postgresql.or     4914                 :        9470838 : MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
                               4915                 :                : {
                               4916                 :                :     BufferDesc *bufHdr;
 2916 kgrittn@postgresql.o     4917                 :        9470838 :     Page        page = BufferGetPage(buffer);
                               4918                 :                : 
 7121 tgl@sss.pgh.pa.us        4919         [ -  + ]:        9470838 :     if (!BufferIsValid(buffer))
 4683 peter_e@gmx.net          4920         [ #  # ]:UBC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
                               4921                 :                : 
 8771 tgl@sss.pgh.pa.us        4922         [ +  + ]:CBC     9470838 :     if (BufferIsLocal(buffer))
                               4923                 :                :     {
 6589                          4924                 :         679437 :         MarkLocalBufferDirty(buffer);
 8771                          4925                 :         679437 :         return;
                               4926                 :                :     }
                               4927                 :                : 
 3363 andres@anarazel.de       4928                 :        8791401 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               4929                 :                : 
 3515                          4930         [ -  + ]:        8791401 :     Assert(GetPrivateRefCount(buffer) > 0);
                               4931                 :                :     /* here, either share or exclusive lock is OK */
 3043 rhaas@postgresql.org     4932         [ -  + ]:        8791401 :     Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
                               4933                 :                : 
                               4934                 :                :     /*
                               4935                 :                :      * This routine might get called many times on the same page, if we are
                               4936                 :                :      * making the first scan after commit of an xact that added/deleted many
                               4937                 :                :      * tuples. So, be as quick as we can if the buffer is already dirty.  We
                               4938                 :                :      * do this by not acquiring spinlock if it looks like the status bits are
                               4939                 :                :      * already set.  Since we make this test unlocked, there's a chance we
                               4940                 :                :      * might fail to notice that the flags have just been cleared, and so fail
                               4941                 :                :      * to re-set them, due to memory-ordering issues.  But since this function
                               4942                 :                :      * is only intended to be used in cases where failing to write out the
                               4943                 :                :      * data would be harmless anyway, it doesn't really matter.
                               4944                 :                :      */
 2926 andres@anarazel.de       4945         [ +  + ]:        8791401 :     if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
                               4946                 :                :         (BM_DIRTY | BM_JUST_DIRTIED))
                               4947                 :                :     {
 4041 simon@2ndQuadrant.co     4948                 :         927733 :         XLogRecPtr  lsn = InvalidXLogRecPtr;
                               4949                 :         927733 :         bool        dirtied = false;
  737 rhaas@postgresql.org     4950                 :         927733 :         bool        delayChkptFlags = false;
                               4951                 :                :         uint32      buf_state;
                               4952                 :                : 
                               4953                 :                :         /*
                               4954                 :                :          * If we need to protect hint bit updates from torn writes, WAL-log a
                               4955                 :                :          * full page image of the page. This full page image is only necessary
                               4956                 :                :          * if the hint bit update is the first change to the page since the
                               4957                 :                :          * last checkpoint.
                               4958                 :                :          *
                               4959                 :                :          * We don't check full_page_writes here because that logic is included
                               4960                 :                :          * when we call XLogInsert() since the value changes dynamically.
                               4961                 :                :          */
 2926 andres@anarazel.de       4962   [ +  +  +  +  :        1841274 :         if (XLogHintBitIsNeeded() &&
                                              +  + ]
                               4963                 :         913541 :             (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
                               4964                 :                :         {
                               4965                 :                :             /*
                               4966                 :                :              * If we must not write WAL, due to a relfilelocator-specific
                               4967                 :                :              * condition or being in recovery, don't dirty the page.  We can
                               4968                 :                :              * set the hint, just not dirty the page as a result, so the hint
                               4969                 :                :              * is lost when we evict the page or shut down.
                               4970                 :                :              *
                               4971                 :                :              * See src/backend/storage/page/README for longer discussion.
                               4972                 :                :              */
 1471 noah@leadboat.com        4973   [ +  +  -  + ]:         953519 :             if (RecoveryInProgress() ||
  599 rhaas@postgresql.org     4974                 :          39981 :                 RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
 4041 simon@2ndQuadrant.co     4975                 :         873557 :                 return;
                               4976                 :                : 
                               4977                 :                :             /*
                               4978                 :                :              * If the block is already dirty because we either made a change
                               4979                 :                :              * or set a hint already, then we don't need to write a full page
                               4980                 :                :              * image.  Note that aggressive cleaning of blocks dirtied by hint
                               4981                 :                :              * bit setting would increase the call rate. Bulk setting of hint
                               4982                 :                :              * bits would reduce the call rate...
                               4983                 :                :              *
                               4984                 :                :              * We must issue the WAL record before we mark the buffer dirty.
                               4985                 :                :              * Otherwise we might write the page before we write the WAL. That
                               4986                 :                :              * causes a race condition, since a checkpoint might occur between
                               4987                 :                :              * writing the WAL record and marking the buffer dirty. We solve
                               4988                 :                :              * that with a kluge, but one that is already in use during
                               4989                 :                :              * transaction commit to prevent race conditions. Basically, we
                               4990                 :                :              * simply prevent the checkpoint WAL record from being written
                               4991                 :                :              * until we have marked the buffer dirty. We don't start the
                               4992                 :                :              * checkpoint flush until we have marked dirty, so our checkpoint
                               4993                 :                :              * must flush the change to disk successfully or the checkpoint
                               4994                 :                :              * never gets written, so crash recovery will fix it.
                               4995                 :                :              *
                               4996                 :                :              * It's possible we may enter here without an xid, so it is
                               4997                 :                :              * essential that CreateCheckPoint waits for virtual transactions
                               4998                 :                :              * rather than full transactionids.
                               4999                 :                :              */
  737 rhaas@postgresql.org     5000         [ -  + ]:          39981 :             Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
                               5001                 :          39981 :             MyProc->delayChkptFlags |= DELAY_CHKPT_START;
                               5002                 :          39981 :             delayChkptFlags = true;
 3954 jdavis@postgresql.or     5003                 :          39981 :             lsn = XLogSaveBufferForHint(buffer, buffer_std);
                               5004                 :                :         }
                               5005                 :                : 
 2926 andres@anarazel.de       5006                 :          54176 :         buf_state = LockBufHdr(bufHdr);
                               5007                 :                : 
                               5008         [ -  + ]:          54176 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               5009                 :                : 
                               5010         [ +  + ]:          54176 :         if (!(buf_state & BM_DIRTY))
                               5011                 :                :         {
 4041 simon@2ndQuadrant.co     5012                 :          54157 :             dirtied = true;     /* Means "will be dirtied by this action" */
                               5013                 :                : 
                               5014                 :                :             /*
                               5015                 :                :              * Set the page LSN if we wrote a backup block. We aren't supposed
                               5016                 :                :              * to set this when only holding a share lock but as long as we
                               5017                 :                :              * serialise it somehow we're OK. We choose to set LSN while
                               5018                 :                :              * holding the buffer header lock, which causes any reader of an
                               5019                 :                :              * LSN who holds only a share lock to also obtain a buffer header
                               5020                 :                :              * lock before using PageGetLSN(), which is enforced in
                               5021                 :                :              * BufferGetLSNAtomic().
                               5022                 :                :              *
                               5023                 :                :              * If checksums are enabled, you might think we should reset the
                               5024                 :                :              * checksum here. That will happen when the page is written
                               5025                 :                :              * sometime later in this checkpoint cycle.
                               5026                 :                :              */
                               5027         [ +  + ]:          54157 :             if (!XLogRecPtrIsInvalid(lsn))
                               5028                 :           6970 :                 PageSetLSN(page, lsn);
                               5029                 :                :         }
                               5030                 :                : 
 2926 andres@anarazel.de       5031                 :          54176 :         buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
                               5032                 :          54176 :         UnlockBufHdr(bufHdr, buf_state);
                               5033                 :                : 
  737 rhaas@postgresql.org     5034         [ +  + ]:          54176 :         if (delayChkptFlags)
                               5035                 :          39981 :             MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
                               5036                 :                : 
 4041 simon@2ndQuadrant.co     5037         [ +  + ]:          54176 :         if (dirtied)
                               5038                 :                :         {
 4524 alvherre@alvh.no-ip.     5039                 :          54157 :             VacuumPageDirty++;
 3667 rhaas@postgresql.org     5040                 :          54157 :             pgBufferUsage.shared_blks_dirtied++;
 4524 alvherre@alvh.no-ip.     5041         [ +  + ]:          54157 :             if (VacuumCostActive)
                               5042                 :           1860 :                 VacuumCostBalance += VacuumCostPageDirty;
                               5043                 :                :         }
                               5044                 :                :     }
                               5045                 :                : }
                               5046                 :                : 
                               5047                 :                : /*
                               5048                 :                :  * Release buffer content locks for shared buffers.
                               5049                 :                :  *
                               5050                 :                :  * Used to clean up after errors.
                               5051                 :                :  *
                               5052                 :                :  * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care of
                               5053                 :                :  * the content locks per se; all that is left to do here is clear our own
                               5054                 :                :  * BM_PIN_COUNT_WAITER request, if one was in progress (see PinCountWaitBuf).
                               5055                 :                :  */
                               5056                 :                : void
 8493 tgl@sss.pgh.pa.us        5057                 :          45427 : UnlockBuffers(void)
                               5058                 :                : {
 3072 rhaas@postgresql.org     5059                 :          45427 :     BufferDesc *buf = PinCountWaitBuf;
                               5060                 :                : 
 7120 tgl@sss.pgh.pa.us        5061         [ -  + ]:          45427 :     if (buf)
                               5062                 :                :     {
                               5063                 :                :         uint32      buf_state;
                               5064                 :                : 
 2926 andres@anarazel.de       5065                 :UBC           0 :         buf_state = LockBufHdr(buf);
                               5066                 :                : 
                               5067                 :                :         /*
                               5068                 :                :          * Don't complain if the flag bit is not set; it could have been reset,
                               5069                 :                :          * but we got a cancel/die interrupt before getting the signal.
                               5070                 :                :          */
                               5071         [ #  # ]:              0 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
   52 heikki.linnakangas@i     5072         [ #  # ]:UNC           0 :             buf->wait_backend_pgprocno == MyProcNumber)
 2926 andres@anarazel.de       5073                 :UBC           0 :             buf_state &= ~BM_PIN_COUNT_WAITER;
                               5074                 :                : 
                               5075                 :              0 :         UnlockBufHdr(buf, buf_state);
                               5076                 :                : 
 6981 tgl@sss.pgh.pa.us        5077                 :              0 :         PinCountWaitBuf = NULL;
                               5078                 :                :     }
 9252 vadim4o@yahoo.com        5079                 :CBC       45427 : }
                               5080                 :                : 
                               5081                 :                : /*
                               5082                 :                :  * Acquire (shared/exclusive) or release the buffer's content lock per 'mode'.
                               5083                 :                :  */
                               5084                 :                : void
 9091 bruce@momjian.us         5085                 :      150592046 : LockBuffer(Buffer buffer, int mode)
                               5086                 :                : {
                               5087                 :                :     BufferDesc *buf;
                               5088                 :                : 
 1364 pg@bowt.ie               5089   [ -  +  +  +  :      150592046 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
 9252 vadim4o@yahoo.com        5090         [ +  + ]:      150592046 :     if (BufferIsLocal(buffer))
 6589 tgl@sss.pgh.pa.us        5091                 :        9828845 :         return;                 /* local buffers need no lock */
                               5092                 :                : 
 3363 andres@anarazel.de       5093                 :      140763201 :     buf = GetBufferDescriptor(buffer - 1);
                               5094                 :                : 
 9252 vadim4o@yahoo.com        5095         [ +  + ]:      140763201 :     if (mode == BUFFER_LOCK_UNLOCK)
 3043 rhaas@postgresql.org     5096                 :       71091205 :         LWLockRelease(BufferDescriptorGetContentLock(buf));
 9252 vadim4o@yahoo.com        5097         [ +  + ]:       69671996 :     else if (mode == BUFFER_LOCK_SHARE)
 3043 rhaas@postgresql.org     5098                 :       49703457 :         LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
 9252 vadim4o@yahoo.com        5099         [ +  - ]:       19968539 :     else if (mode == BUFFER_LOCK_EXCLUSIVE)
 3043 rhaas@postgresql.org     5100                 :       19968539 :         LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
                               5101                 :                :     else
 7570 tgl@sss.pgh.pa.us        5102         [ #  # ]:UBC           0 :         elog(ERROR, "unrecognized buffer lock mode: %d", mode);
                               5103                 :                : }
                               5104                 :                : 
                               5105                 :                : /*
                               5106                 :                :  * Try to acquire the buffer's content_lock without waiting; true on success.
                               5107                 :                :  *
                               5108                 :                :  * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
                               5109                 :                :  */
                               5110                 :                : bool
 7553 tgl@sss.pgh.pa.us        5111                 :CBC     1309073 : ConditionalLockBuffer(Buffer buffer)
                               5112                 :                : {
                               5113                 :                :     BufferDesc *buf;
                               5114                 :                : 
 1364 pg@bowt.ie               5115   [ -  +  +  +  :        1309073 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
 7553 tgl@sss.pgh.pa.us        5116         [ +  + ]:        1309073 :     if (BufferIsLocal(buffer))
                               5117                 :          64786 :         return true;            /* local: act as though we got it */
                               5118                 :                : 
 3363 andres@anarazel.de       5119                 :        1244287 :     buf = GetBufferDescriptor(buffer - 1);
                               5120                 :                : 
 3043 rhaas@postgresql.org     5121                 :        1244287 :     return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
                               5122                 :                :                                     LW_EXCLUSIVE);
                               5123                 :                : }
                               5124                 :                : 
                               5125                 :                : /*
                               5126                 :                :  * Raise an ERROR unless this backend is pinning the buffer exactly once.
                               5127                 :                :  *
                               5128                 :                :  * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
                               5129                 :                :  * holds a pin on the buffer.  We do not care whether some other backend does.
                               5130                 :                :  */
                               5131                 :                : void
  375 andres@anarazel.de       5132                 :        3191689 : CheckBufferIsPinnedOnce(Buffer buffer)
                               5133                 :                : {
                               5134         [ +  + ]:        3191689 :     if (BufferIsLocal(buffer))
                               5135                 :                :     {
                               5136         [ -  + ]:             16 :         if (LocalRefCount[-buffer - 1] != 1)
  375 andres@anarazel.de       5137         [ #  # ]:UBC           0 :             elog(ERROR, "incorrect local pin count: %d",
                               5138                 :                :                  LocalRefCount[-buffer - 1]);
                               5139                 :                :     }
                               5140                 :                :     else
                               5141                 :                :     {
  375 andres@anarazel.de       5142         [ -  + ]:CBC     3191673 :         if (GetPrivateRefCount(buffer) != 1)
  375 andres@anarazel.de       5143         [ #  # ]:UBC           0 :             elog(ERROR, "incorrect local pin count: %d",
                               5144                 :                :                  GetPrivateRefCount(buffer));
                               5145                 :                :     }
  375 andres@anarazel.de       5146                 :CBC     3191689 : }
                               5147                 :                : 
                               5148                 :                : /*
                               5149                 :                :  * LockBufferForCleanup - lock a buffer in preparation for deleting items
                               5150                 :                :  *
                               5151                 :                :  * Items may be deleted from a disk page only when the caller (a) holds an
                               5152                 :                :  * exclusive lock on the buffer and (b) has observed that no other backend
                               5153                 :                :  * holds a pin on the buffer.  If there is a pin, then the other backend
                               5154                 :                :  * might have a pointer into the buffer (for example, a heapscan reference
                               5155                 :                :  * to an item --- see README for more details).  It's OK if a pin is added
                               5156                 :                :  * after the cleanup starts, however; the newly-arrived backend will be
                               5157                 :                :  * unable to look at the page until we release the exclusive lock.
                               5158                 :                :  *
                               5159                 :                :  * To implement this protocol, a would-be deleter must pin the buffer and
                               5160                 :                :  * then call LockBufferForCleanup().  LockBufferForCleanup() is similar to
                               5161                 :                :  * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
                               5162                 :                :  * it has successfully observed pin count = 1.
                               5163                 :                :  */
                               5164                 :                : void
 8318 tgl@sss.pgh.pa.us        5165                 :          21773 : LockBufferForCleanup(Buffer buffer)
                               5166                 :                : {
                               5167                 :                :     BufferDesc *bufHdr;
 1192 fujii@postgresql.org     5168                 :          21773 :     TimestampTz waitStart = 0;
  419 drowley@postgresql.o     5169                 :          21773 :     bool        waiting = false;
 1192 fujii@postgresql.org     5170                 :          21773 :     bool        logged_recovery_conflict = false;
                               5171                 :                : 
 1364 pg@bowt.ie               5172   [ -  +  +  +  :          21773 :     Assert(BufferIsPinned(buffer));
                                              -  + ]
 7120 tgl@sss.pgh.pa.us        5173         [ -  + ]:          21773 :     Assert(PinCountWaitBuf == NULL);
                               5174                 :                : 
  375 andres@anarazel.de       5175                 :          21773 :     CheckBufferIsPinnedOnce(buffer);
                               5176                 :                : 
                               5177                 :                :     /* Nobody else to wait for */
 8318 tgl@sss.pgh.pa.us        5178         [ +  + ]:          21773 :     if (BufferIsLocal(buffer))
                               5179                 :             16 :         return;
                               5180                 :                : 
 3363 andres@anarazel.de       5181                 :          21757 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               5182                 :                : 
                               5183                 :                :     for (;;)
 8318 tgl@sss.pgh.pa.us        5184                 :GBC          11 :     {
                               5185                 :                :         uint32      buf_state;
                               5186                 :                : 
                               5187                 :                :         /* Try to acquire lock */
 8318 tgl@sss.pgh.pa.us        5188                 :CBC       21768 :         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 2926 andres@anarazel.de       5189                 :          21768 :         buf_state = LockBufHdr(bufHdr);
                               5190                 :                : 
                               5191         [ -  + ]:          21768 :         Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               5192         [ +  + ]:          21768 :         if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
                               5193                 :                :         {
                               5194                 :                :             /* Successfully acquired exclusive lock with pincount 1 */
                               5195                 :          21757 :             UnlockBufHdr(bufHdr, buf_state);
                               5196                 :                : 
                               5197                 :                :             /*
                               5198                 :                :              * Emit the log message if recovery conflict on buffer pin was
                               5199                 :                :              * resolved but the startup process waited longer than
                               5200                 :                :              * deadlock_timeout for it.
                               5201                 :                :              */
 1187 fujii@postgresql.org     5202         [ +  + ]:          21757 :             if (logged_recovery_conflict)
 1187 fujii@postgresql.org     5203                 :GBC           2 :                 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
                               5204                 :                :                                     waitStart, GetCurrentTimestamp(),
                               5205                 :                :                                     NULL, false);
                               5206                 :                : 
  419 drowley@postgresql.o     5207         [ +  + ]:CBC       21757 :             if (waiting)
                               5208                 :                :             {
                               5209                 :                :                 /* reset ps display to remove the suffix if we added one */
  419 drowley@postgresql.o     5210                 :GBC           2 :                 set_ps_display_remove_suffix();
                               5211                 :              2 :                 waiting = false;
                               5212                 :                :             }
 8318 tgl@sss.pgh.pa.us        5213                 :CBC       21757 :             return;
                               5214                 :                :         }
                               5215                 :                :         /* Failed, so mark myself as waiting for pincount 1 */
 2926 andres@anarazel.de       5216         [ -  + ]:GBC          11 :         if (buf_state & BM_PIN_COUNT_WAITER)
                               5217                 :                :         {
 2926 andres@anarazel.de       5218                 :UBC           0 :             UnlockBufHdr(bufHdr, buf_state);
 8318 tgl@sss.pgh.pa.us        5219                 :              0 :             LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 7570                          5220         [ #  # ]:              0 :             elog(ERROR, "multiple backends attempting to wait for pincount 1");
                               5221                 :                :         }
   52 heikki.linnakangas@i     5222                 :GNC          11 :         bufHdr->wait_backend_pgprocno = MyProcNumber;
 7120 tgl@sss.pgh.pa.us        5223                 :GBC          11 :         PinCountWaitBuf = bufHdr;
 2926 andres@anarazel.de       5224                 :             11 :         buf_state |= BM_PIN_COUNT_WAITER;
                               5225                 :             11 :         UnlockBufHdr(bufHdr, buf_state);
 8318 tgl@sss.pgh.pa.us        5226                 :             11 :         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                               5227                 :                : 
                               5228                 :                :         /* Wait to be signaled by UnpinBuffer() */
 5195 simon@2ndQuadrant.co     5229         [ +  - ]:             11 :         if (InHotStandby)
                               5230                 :                :         {
  419 drowley@postgresql.o     5231         [ +  + ]:             11 :             if (!waiting)
                               5232                 :                :             {
                               5233                 :                :                 /* adjust the process title to indicate that it's waiting */
                               5234                 :              2 :                 set_ps_display_suffix("waiting");
                               5235                 :              2 :                 waiting = true;
                               5236                 :                :             }
                               5237                 :                : 
                               5238                 :                :             /*
                               5239                 :                :              * Emit the log message if the startup process is waiting longer
                               5240                 :                :              * than deadlock_timeout for recovery conflict on buffer pin.
                               5241                 :                :              *
                               5242                 :                :              * Skip this the first time through, because the startup process
                               5243                 :                :              * has not started waiting yet at that point; the wait start
                               5244                 :                :              * timestamp is only set after this check.
                               5245                 :                :              */
 1192 fujii@postgresql.org     5246   [ +  +  +  + ]:             11 :             if (waitStart != 0 && !logged_recovery_conflict)
                               5247                 :                :             {
                               5248                 :              4 :                 TimestampTz now = GetCurrentTimestamp();
                               5249                 :                : 
                               5250         [ +  + ]:              4 :                 if (TimestampDifferenceExceeds(waitStart, now,
                               5251                 :                :                                                DeadlockTimeout))
                               5252                 :                :                 {
                               5253                 :              2 :                     LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
                               5254                 :                :                                         waitStart, now, NULL, true);
                               5255                 :              2 :                     logged_recovery_conflict = true;
                               5256                 :                :                 }
                               5257                 :                :             }
                               5258                 :                : 
                               5259                 :                :             /*
                               5260                 :                :              * Set the wait start timestamp if logging is enabled and first
                               5261                 :                :              * time through.
                               5262                 :                :              */
                               5263   [ +  -  +  + ]:             11 :             if (log_recovery_conflict_waits && waitStart == 0)
                               5264                 :              2 :                 waitStart = GetCurrentTimestamp();
                               5265                 :                : 
                               5266                 :                :             /* Publish the bufid that Startup process waits on */
 5195 simon@2ndQuadrant.co     5267                 :             11 :             SetStartupBufferPinWaitBufId(buffer - 1);
                               5268                 :                :             /* Set alarm and then wait to be signaled by UnpinBuffer() */
                               5269                 :             11 :             ResolveRecoveryConflictWithBufferPin();
                               5270                 :                :             /* Reset the published bufid */
                               5271                 :             11 :             SetStartupBufferPinWaitBufId(-1);
                               5272                 :                :         }
                               5273                 :                :         else
  286 michael@paquier.xyz      5274                 :UNC           0 :             ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
                               5275                 :                : 
                               5276                 :                :         /*
                               5277                 :                :          * Remove flag marking us as waiter. Normally this will not be set
                               5278                 :                :          * anymore, but ProcWaitForSignal() can return for other signals as
                               5279                 :                :          * well.  We take care to only reset the flag if we're the waiter, as
                               5280                 :                :          * theoretically another backend could have started waiting. That's
                               5281                 :                :          * impossible with the current usages due to table level locking, but
                               5282                 :                :          * better be safe.
                               5283                 :                :          */
 2926 andres@anarazel.de       5284                 :GBC          11 :         buf_state = LockBufHdr(bufHdr);
                               5285         [ +  + ]:             11 :         if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
   52 heikki.linnakangas@i     5286         [ +  - ]:GNC           9 :             bufHdr->wait_backend_pgprocno == MyProcNumber)
 2926 andres@anarazel.de       5287                 :GBC           9 :             buf_state &= ~BM_PIN_COUNT_WAITER;
                               5288                 :             11 :         UnlockBufHdr(bufHdr, buf_state);
                               5289                 :                : 
 7120 tgl@sss.pgh.pa.us        5290                 :             11 :         PinCountWaitBuf = NULL;
                               5291                 :                :         /* Loop back and try again */
                               5292                 :                :     }
                               5293                 :                : }
                               5294                 :                : 
                               5295                 :                : /*
                               5296                 :                :  * Check called from ProcessRecoveryConflictInterrupts() when the Startup
                               5297                 :                :  * process requests cancellation of all pin holders that are blocking it.
                               5298                 :                :  */
                               5299                 :                : bool
 5195 simon@2ndQuadrant.co     5300                 :              4 : HoldingBufferPinThatDelaysRecovery(void)
                               5301                 :                : {
 5161 bruce@momjian.us         5302                 :              4 :     int         bufid = GetStartupBufferPinWaitBufId();
                               5303                 :                : 
                               5304                 :                :     /*
                               5305                 :                :      * If we get woken slowly then it's possible that the Startup process was
                               5306                 :                :      * already woken by other backends before we got here. Also possible that
                               5307                 :                :      * we get here by multiple interrupts or interrupts at inappropriate
                               5308                 :                :      * times, so make sure we do nothing if the bufid is not set.
                               5309                 :                :      */
 5195 simon@2ndQuadrant.co     5310         [ +  + ]:              4 :     if (bufid < 0)
                               5311                 :              2 :         return false;
                               5312                 :                : 
 3515 andres@anarazel.de       5313         [ +  - ]:              2 :     if (GetPrivateRefCount(bufid + 1) > 0)
 5195 simon@2ndQuadrant.co     5314                 :              2 :         return true;
                               5315                 :                : 
 5195 simon@2ndQuadrant.co     5316                 :UBC           0 :     return false;
                               5317                 :                : }
                               5318                 :                : 
                               5319                 :                : /*
                               5320                 :                :  * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
                               5321                 :                :  *
                               5322                 :                :  * We won't loop, but just check once to see if the pin count is OK.  If
                               5323                 :                :  * not, return false with no lock held.
                               5324                 :                :  */
                               5325                 :                : bool
 6051 tgl@sss.pgh.pa.us        5326                 :CBC      363571 : ConditionalLockBufferForCleanup(Buffer buffer)
                               5327                 :                : {
                               5328                 :                :     BufferDesc *bufHdr;
                               5329                 :                :     uint32      buf_state,
                               5330                 :                :                 refcount;
                               5331                 :                : 
                               5332         [ -  + ]:         363571 :     Assert(BufferIsValid(buffer));
                               5333                 :                : 
                               5334         [ +  + ]:         363571 :     if (BufferIsLocal(buffer))
                               5335                 :                :     {
 2926 andres@anarazel.de       5336                 :            785 :         refcount = LocalRefCount[-buffer - 1];
                               5337                 :                :         /* There should be exactly one pin */
                               5338         [ -  + ]:            785 :         Assert(refcount > 0);
                               5339         [ +  + ]:            785 :         if (refcount != 1)
 6051 tgl@sss.pgh.pa.us        5340                 :             21 :             return false;
                               5341                 :                :         /* Nobody else to wait for */
                               5342                 :            764 :         return true;
                               5343                 :                :     }
                               5344                 :                : 
                               5345                 :                :     /* There should be exactly one local pin */
 2926 andres@anarazel.de       5346                 :         362786 :     refcount = GetPrivateRefCount(buffer);
                               5347         [ -  + ]:         362786 :     Assert(refcount);
                               5348         [ +  + ]:         362786 :     if (refcount != 1)
 6051 tgl@sss.pgh.pa.us        5349                 :            139 :         return false;
                               5350                 :                : 
                               5351                 :                :     /* Try to acquire lock */
                               5352         [ +  + ]:         362647 :     if (!ConditionalLockBuffer(buffer))
                               5353                 :             64 :         return false;
                               5354                 :                : 
 3363 andres@anarazel.de       5355                 :         362583 :     bufHdr = GetBufferDescriptor(buffer - 1);
 2926                          5356                 :         362583 :     buf_state = LockBufHdr(bufHdr);
                               5357                 :         362583 :     refcount = BUF_STATE_GET_REFCOUNT(buf_state);
                               5358                 :                : 
                               5359         [ -  + ]:         362583 :     Assert(refcount > 0);
                               5360         [ +  + ]:         362583 :     if (refcount == 1)
                               5361                 :                :     {
                               5362                 :                :         /* Successfully acquired exclusive lock with pincount 1 */
                               5363                 :         362524 :         UnlockBufHdr(bufHdr, buf_state);
 6051 tgl@sss.pgh.pa.us        5364                 :         362524 :         return true;
                               5365                 :                :     }
                               5366                 :                : 
                               5367                 :                :     /* Failed, so release the lock */
 2926 andres@anarazel.de       5368                 :             59 :     UnlockBufHdr(bufHdr, buf_state);
 6051 tgl@sss.pgh.pa.us        5369                 :             59 :     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                               5370                 :             59 :     return false;
                               5371                 :                : }
                               5372                 :                : 
                               5373                 :                : /*
                               5374                 :                :  * IsBufferCleanupOK - as above, but we already have the lock
                               5375                 :                :  *
                               5376                 :                :  * Check whether it's OK to perform cleanup on a buffer we've already
                               5377                 :                :  * locked.  If we observe that the pin count is 1, our exclusive lock
                               5378                 :                :  * happens to be a cleanup lock, and we can proceed with anything that
                               5379                 :                :  * would have been allowable had we sought a cleanup lock originally.
                               5380                 :                :  */
                               5381                 :                : bool
 2718 rhaas@postgresql.org     5382                 :           2030 : IsBufferCleanupOK(Buffer buffer)
                               5383                 :                : {
                               5384                 :                :     BufferDesc *bufHdr;
                               5385                 :                :     uint32      buf_state;
                               5386                 :                : 
                               5387         [ -  + ]:           2030 :     Assert(BufferIsValid(buffer));
                               5388                 :                : 
                               5389         [ -  + ]:           2030 :     if (BufferIsLocal(buffer))
                               5390                 :                :     {
                               5391                 :                :         /* There should be exactly one pin */
 2718 rhaas@postgresql.org     5392         [ #  # ]:UBC           0 :         if (LocalRefCount[-buffer - 1] != 1)
                               5393                 :              0 :             return false;
                               5394                 :                :         /* Nobody else to wait for */
                               5395                 :              0 :         return true;
                               5396                 :                :     }
                               5397                 :                : 
                               5398                 :                :     /* There should be exactly one local pin */
 2718 rhaas@postgresql.org     5399         [ -  + ]:CBC        2030 :     if (GetPrivateRefCount(buffer) != 1)
 2718 rhaas@postgresql.org     5400                 :UBC           0 :         return false;
                               5401                 :                : 
 2718 rhaas@postgresql.org     5402                 :CBC        2030 :     bufHdr = GetBufferDescriptor(buffer - 1);
                               5403                 :                : 
                               5404                 :                :     /* caller must hold exclusive lock on buffer */
                               5405         [ -  + ]:           2030 :     Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
                               5406                 :                :                                 LW_EXCLUSIVE));
                               5407                 :                : 
                               5408                 :           2030 :     buf_state = LockBufHdr(bufHdr);
                               5409                 :                : 
                               5410         [ -  + ]:           2030 :     Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
                               5411         [ +  - ]:           2030 :     if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
                               5412                 :                :     {
                               5413                 :                :         /* pincount is OK. */
                               5414                 :           2030 :         UnlockBufHdr(bufHdr, buf_state);
                               5415                 :           2030 :         return true;
                               5416                 :                :     }
                               5417                 :                : 
 2718 rhaas@postgresql.org     5418                 :UBC           0 :     UnlockBufHdr(bufHdr, buf_state);
                               5419                 :              0 :     return false;
                               5420                 :                : }
                               5421                 :                : 
                               5422                 :                : 
                               5423                 :                : /*
                               5424                 :                :  *  Functions for buffer I/O handling
                               5425                 :                :  *
                               5426                 :                :  *  Note: We assume that nested buffer I/O never occurs.
                               5427                 :                :  *  i.e at most one BM_IO_IN_PROGRESS bit is set per proc.
                               5428                 :                :  *
                               5429                 :                :  *  Also note that these are used only for shared buffers, not local ones.
                               5430                 :                :  */
                               5431                 :                : 
                               5432                 :                : /*
                               5433                 :                :  * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
                               5434                 :                :  */
                               5435                 :                : static void
 3072 rhaas@postgresql.org     5436                 :CBC        4409 : WaitIO(BufferDesc *buf)
                               5437                 :                : {
 1130 tmunro@postgresql.or     5438                 :           4409 :     ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
                               5439                 :                : 
                               5440                 :           4409 :     ConditionVariablePrepareToSleep(cv);
                               5441                 :                :     for (;;)
 6981 tgl@sss.pgh.pa.us        5442                 :           4021 :     {
                               5443                 :                :         uint32      buf_state;
                               5444                 :                : 
                               5445                 :                :         /*
                               5446                 :                :          * It may not be necessary to acquire the spinlock to check the flag
                               5447                 :                :          * here, but since this test is essential for correctness, we'd better
                               5448                 :                :          * play it safe.
                               5449                 :                :          */
 2926 andres@anarazel.de       5450                 :           8430 :         buf_state = LockBufHdr(buf);
                               5451                 :           8430 :         UnlockBufHdr(buf, buf_state);
                               5452                 :                : 
                               5453         [ +  + ]:           8430 :         if (!(buf_state & BM_IO_IN_PROGRESS))
 6981 tgl@sss.pgh.pa.us        5454                 :           4409 :             break;
 1130 tmunro@postgresql.or     5455                 :           4021 :         ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
                               5456                 :                :     }
                               5457                 :           4409 :     ConditionVariableCancelSleep();
 6981 tgl@sss.pgh.pa.us        5458                 :           4409 : }
                               5459                 :                : 
                               5460                 :                : /*
                               5461                 :                :  * StartBufferIO: begin I/O on this buffer
                               5462                 :                :  *  (Assumptions)
                               5463                 :                :  *  My process is executing no IO
                               5464                 :                :  *  The buffer is Pinned
                               5465                 :                :  *
                               5466                 :                :  * In some scenarios there are race conditions in which multiple backends
                               5467                 :                :  * could attempt the same I/O operation concurrently.  If someone else
                               5468                 :                :  * has already started I/O on this buffer then we will block on the
                               5469                 :                :  * I/O condition variable until he's done.
                               5470                 :                :  *
                               5471                 :                :  * Input operations are only attempted on buffers that are not BM_VALID,
                               5472                 :                :  * and output operations only on buffers that are BM_VALID and BM_DIRTY,
                               5473                 :                :  * so we can always tell if the work is already done.
                               5474                 :                :  *
                               5475                 :                :  * Returns true if we successfully marked the buffer as I/O busy,
                               5476                 :                :  * false if someone else already did the work.
                               5477                 :                :  *
                               5478                 :                :  * If nowait is true, then we don't wait for an I/O to be finished by another
                               5479                 :                :  * backend.  In that case, false indicates either that the I/O was already
                               5480                 :                :  * finished, or is still in progress.  This is useful for callers that want to
                               5481                 :                :  * find out if they can perform the I/O as part of a larger operation, without
                               5482                 :                :  * waiting for the answer or distinguishing the reasons why not.
                               5483                 :                :  */
                               5484                 :                : static bool
   11 tmunro@postgresql.or     5485                 :GNC     1881120 : StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
                               5486                 :                : {
                               5487                 :                :     uint32      buf_state;
                               5488                 :                : 
  158 heikki.linnakangas@i     5489                 :        1881120 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               5490                 :                : 
                               5491                 :                :     for (;;)
                               5492                 :                :     {
 2926 andres@anarazel.de       5493                 :CBC     1885528 :         buf_state = LockBufHdr(buf);
                               5494                 :                : 
                               5495         [ +  + ]:        1885528 :         if (!(buf_state & BM_IO_IN_PROGRESS))
 6981 tgl@sss.pgh.pa.us        5496                 :        1881120 :             break;
 2926 andres@anarazel.de       5497                 :           4408 :         UnlockBufHdr(buf, buf_state);
   11 tmunro@postgresql.or     5498         [ -  + ]:GNC        4408 :         if (nowait)
   11 tmunro@postgresql.or     5499                 :UNC           0 :             return false;
 6981 tgl@sss.pgh.pa.us        5500                 :CBC        4408 :         WaitIO(buf);
                               5501                 :                :     }
                               5502                 :                : 
                               5503                 :                :     /* Once we get here, there is definitely no I/O active on this buffer */
                               5504                 :                : 
 2926 andres@anarazel.de       5505   [ +  +  +  + ]:        1881120 :     if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
                               5506                 :                :     {
                               5507                 :                :         /* someone else already did the I/O */
                               5508                 :           5935 :         UnlockBufHdr(buf, buf_state);
 6981 tgl@sss.pgh.pa.us        5509                 :           5935 :         return false;
                               5510                 :                :     }
                               5511                 :                : 
 2926 andres@anarazel.de       5512                 :        1875185 :     buf_state |= BM_IO_IN_PROGRESS;
                               5513                 :        1875185 :     UnlockBufHdr(buf, buf_state);
                               5514                 :                : 
  375                          5515                 :        1875185 :     ResourceOwnerRememberBufferIO(CurrentResourceOwner,
                               5516                 :                :                                   BufferDescriptorGetBuffer(buf));
                               5517                 :                : 
 6981 tgl@sss.pgh.pa.us        5518                 :        1875185 :     return true;
                               5519                 :                : }
                               5520                 :                : 
                               5521                 :                : /*
                               5522                 :                :  * TerminateBufferIO: release a buffer we were doing I/O on
                               5523                 :                :  *  (Assumptions)
                               5524                 :                :  *  My process is executing IO for the buffer
                               5525                 :                :  *  BM_IO_IN_PROGRESS bit is set for the buffer
                               5526                 :                :  *  The buffer is Pinned
                               5527                 :                :  *
                               5528                 :                :  * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
                               5529                 :                :  * buffer's BM_DIRTY flag.  This is appropriate when terminating a
                               5530                 :                :  * successful write.  The check on BM_JUST_DIRTIED is necessary to avoid
                               5531                 :                :  * marking the buffer clean if it was re-dirtied while we were writing.
                               5532                 :                :  *
                               5533                 :                :  * set_flag_bits gets ORed into the buffer's flags.  It must include
                               5534                 :                :  * BM_IO_ERROR in a failure case.  For successful completion it could
                               5535                 :                :  * be 0, or BM_VALID if we just finished reading in the page.
                               5536                 :                :  *
                               5537                 :                :  * If forget_owner is true, we release the buffer I/O from the current
                               5538                 :                :  * resource owner. (forget_owner=false is used when the resource owner itself
                               5539                 :                :  * is being released)
                               5540                 :                :  */
                               5541                 :                : static void
  158 heikki.linnakangas@i     5542                 :GNC     1875185 : TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
                               5543                 :                :                   bool forget_owner)
                               5544                 :                : {
                               5545                 :                :     uint32      buf_state;
                               5546                 :                : 
 2926 andres@anarazel.de       5547                 :CBC     1875185 :     buf_state = LockBufHdr(buf);
                               5548                 :                : 
                               5549         [ -  + ]:        1875185 :     Assert(buf_state & BM_IO_IN_PROGRESS);
                               5550                 :                : 
                               5551                 :        1875185 :     buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
                               5552   [ +  +  +  + ]:        1875185 :     if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
                               5553                 :         530328 :         buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
                               5554                 :                : 
                               5555                 :        1875185 :     buf_state |= set_flag_bits;
                               5556                 :        1875185 :     UnlockBufHdr(buf, buf_state);
                               5557                 :                : 
  158 heikki.linnakangas@i     5558         [ +  + ]:GNC     1875185 :     if (forget_owner)
                               5559                 :        1875170 :         ResourceOwnerForgetBufferIO(CurrentResourceOwner,
                               5560                 :                :                                     BufferDescriptorGetBuffer(buf));
                               5561                 :                : 
 1130 tmunro@postgresql.or     5562                 :CBC     1875185 :     ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
 8854 inoue@tpf.co.jp          5563                 :        1875185 : }
                               5564                 :                : 
                               5565                 :                : /*
                               5566                 :                :  * AbortBufferIO: Clean up active buffer I/O after an error.
                               5567                 :                :  *
                               5568                 :                :  *  All LWLocks we might have held have been released,
                               5569                 :                :  *  but we haven't yet released buffer pins, so the buffer is still pinned.
                               5570                 :                :  *
                               5571                 :                :  *  If I/O was in progress, we always set BM_IO_ERROR, even though it's
                               5572                 :                :  *  possible the error condition wasn't related to the I/O.
                               5573                 :                :  *
                               5574                 :                :  *  Note: this does not remove the buffer I/O from the resource owner.
                               5575                 :                :  *  That's correct when we're releasing the whole resource owner, but
                               5576                 :                :  *  beware if you use this in other contexts.
                               5577                 :                :  */
                               5578                 :                : static void
  367 pg@bowt.ie               5579                 :             15 : AbortBufferIO(Buffer buffer)
                               5580                 :                : {
                               5581                 :             15 :     BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
                               5582                 :                :     uint32      buf_state;
                               5583                 :                : 
  375 andres@anarazel.de       5584                 :             15 :     buf_state = LockBufHdr(buf_hdr);
                               5585         [ -  + ]:             15 :     Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
                               5586                 :                : 
                               5587         [ +  - ]:             15 :     if (!(buf_state & BM_VALID))
                               5588                 :                :     {
                               5589         [ -  + ]:             15 :         Assert(!(buf_state & BM_DIRTY));
                               5590                 :             15 :         UnlockBufHdr(buf_hdr, buf_state);
                               5591                 :                :     }
                               5592                 :                :     else
                               5593                 :                :     {
  373 andres@anarazel.de       5594         [ #  # ]:UBC           0 :         Assert(buf_state & BM_DIRTY);
  375                          5595                 :              0 :         UnlockBufHdr(buf_hdr, buf_state);
                               5596                 :                : 
                               5597                 :                :         /* Issue notice if this is not the first failure... */
                               5598         [ #  # ]:              0 :         if (buf_state & BM_IO_ERROR)
                               5599                 :                :         {
                               5600                 :                :             /* Buffer is pinned, so we can read tag without spinlock */
                               5601                 :                :             char       *path;
                               5602                 :                : 
                               5603                 :              0 :             path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
                               5604                 :                :                                BufTagGetForkNum(&buf_hdr->tag));
                               5605         [ #  # ]:              0 :             ereport(WARNING,
                               5606                 :                :                     (errcode(ERRCODE_IO_ERROR),
                               5607                 :                :                      errmsg("could not write block %u of %s",
                               5608                 :                :                             buf_hdr->tag.blockNum, path),
                               5609                 :                :                      errdetail("Multiple failures --- write error might be permanent.")));
                               5610                 :              0 :             pfree(path);
                               5611                 :                :         }
                               5612                 :                :     }
                               5613                 :                : 
  158 heikki.linnakangas@i     5614                 :GNC          15 :     TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
 8854 inoue@tpf.co.jp          5615                 :CBC          15 : }
                               5616                 :                : 
                               5617                 :                : /*
                               5618                 :                :  * Error context callback for errors occurring during shared buffer writes.
                               5619                 :                :  */
                               5620                 :                : static void
 4993 rhaas@postgresql.org     5621                 :             41 : shared_buffer_write_error_callback(void *arg)
                               5622                 :                : {
 3072                          5623                 :             41 :     BufferDesc *bufHdr = (BufferDesc *) arg;
                               5624                 :                : 
                               5625                 :                :     /* Buffer is pinned, so we can read the tag without locking the spinlock */
 7645 tgl@sss.pgh.pa.us        5626         [ +  - ]:             41 :     if (bufHdr != NULL)
                               5627                 :                :     {
  599 rhaas@postgresql.org     5628                 :             41 :         char       *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
                               5629                 :                :                                        BufTagGetForkNum(&bufHdr->tag));
                               5630                 :                : 
 4993                          5631                 :             41 :         errcontext("writing block %u of relation %s",
                               5632                 :                :                    bufHdr->tag.blockNum, path);
                               5633                 :             41 :         pfree(path);
                               5634                 :                :     }
                               5635                 :             41 : }
                               5636                 :                : 
                               5637                 :                : /*
                               5638                 :                :  * Error context callback for errors occurring during local buffer writes.
                               5639                 :                :  */
                               5640                 :                : static void
 4993 rhaas@postgresql.org     5641                 :UBC           0 : local_buffer_write_error_callback(void *arg)
                               5642                 :                : {
 3072                          5643                 :              0 :     BufferDesc *bufHdr = (BufferDesc *) arg;
                               5644                 :                : 
 4993                          5645         [ #  # ]:              0 :     if (bufHdr != NULL)
                               5646                 :                :     {
  599                          5647                 :              0 :         char       *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
                               5648                 :                :                                           MyProcNumber,
                               5649                 :                :                                           BufTagGetForkNum(&bufHdr->tag));
                               5650                 :                : 
 5633 heikki.linnakangas@i     5651                 :              0 :         errcontext("writing block %u of relation %s",
                               5652                 :                :                    bufHdr->tag.blockNum, path);
                               5653                 :              0 :         pfree(path);
                               5654                 :                :     }
 7645 tgl@sss.pgh.pa.us        5655                 :              0 : }
                               5656                 :                : 
                               5657                 :                : /*
                               5658                 :                :  * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
                               5659                 :                :  */
                               5660                 :                : static int
  648 rhaas@postgresql.org     5661                 :CBC     9984884 : rlocator_comparator(const void *p1, const void *p2)
                               5662                 :                : {
                               5663                 :        9984884 :     RelFileLocator n1 = *(const RelFileLocator *) p1;
                               5664                 :        9984884 :     RelFileLocator n2 = *(const RelFileLocator *) p2;
                               5665                 :                : 
                               5666         [ +  + ]:        9984884 :     if (n1.relNumber < n2.relNumber)
 4105 alvherre@alvh.no-ip.     5667                 :        9155607 :         return -1;
  648 rhaas@postgresql.org     5668         [ +  + ]:         829277 :     else if (n1.relNumber > n2.relNumber)
 4105 alvherre@alvh.no-ip.     5669                 :         174163 :         return 1;
                               5670                 :                : 
  648 rhaas@postgresql.org     5671         [ +  + ]:         655114 :     if (n1.dbOid < n2.dbOid)
 4105 alvherre@alvh.no-ip.     5672                 :          55323 :         return -1;
  648 rhaas@postgresql.org     5673         [ +  + ]:         599791 :     else if (n1.dbOid > n2.dbOid)
 4105 alvherre@alvh.no-ip.     5674                 :          68490 :         return 1;
                               5675                 :                : 
  648 rhaas@postgresql.org     5676         [ -  + ]:         531301 :     if (n1.spcOid < n2.spcOid)
 4105 alvherre@alvh.no-ip.     5677                 :UBC           0 :         return -1;
  648 rhaas@postgresql.org     5678         [ -  + ]:CBC      531301 :     else if (n1.spcOid > n2.spcOid)
 4105 alvherre@alvh.no-ip.     5679                 :UBC           0 :         return 1;
                               5680                 :                :     else
 4105 alvherre@alvh.no-ip.     5681                 :CBC      531301 :         return 0;
                               5682                 :                : }
                               5683                 :                : 
                               5684                 :                : /*
                               5685                 :                :  * Lock buffer header - set BM_LOCKED in buffer state.
                               5686                 :                :  */
                               5687                 :                : uint32
 2926 andres@anarazel.de       5688                 :       26884984 : LockBufHdr(BufferDesc *desc)
                               5689                 :                : {
                               5690                 :                :     SpinDelayStatus delayStatus;
                               5691                 :                :     uint32      old_buf_state;
                               5692                 :                : 
  375                          5693         [ -  + ]:       26884984 :     Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
                               5694                 :                : 
 2922                          5695                 :       26884984 :     init_local_spin_delay(&delayStatus);
                               5696                 :                : 
                               5697                 :                :     while (true)
                               5698                 :                :     {
                               5699                 :                :         /* set BM_LOCKED flag */
 2926                          5700                 :       26910677 :         old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
                               5701                 :                :         /* if it wasn't set before we're OK */
                               5702         [ +  + ]:       26910677 :         if (!(old_buf_state & BM_LOCKED))
                               5703                 :       26884984 :             break;
                               5704                 :          25693 :         perform_spin_delay(&delayStatus);
                               5705                 :                :     }
                               5706                 :       26884984 :     finish_spin_delay(&delayStatus);
                               5707                 :       26884984 :     return old_buf_state | BM_LOCKED;
                               5708                 :                : }
                               5709                 :                : 
                               5710                 :                : /*
                               5711                 :                :  * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
                               5712                 :                :  * state at that point.
                               5713                 :                :  *
                               5714                 :                :  * Obviously the buffer could be locked by the time the value is returned, so
                               5715                 :                :  * this is primarily useful in CAS style loops.
                               5716                 :                :  */
                               5717                 :                : static uint32
                               5718                 :           2134 : WaitBufHdrUnlocked(BufferDesc *buf)
                               5719                 :                : {
                               5720                 :                :     SpinDelayStatus delayStatus;
                               5721                 :                :     uint32      buf_state;
                               5722                 :                : 
 2922                          5723                 :           2134 :     init_local_spin_delay(&delayStatus);
                               5724                 :                : 
 2926                          5725                 :           2134 :     buf_state = pg_atomic_read_u32(&buf->state);
                               5726                 :                : 
                               5727         [ +  + ]:          18387 :     while (buf_state & BM_LOCKED)
                               5728                 :                :     {
                               5729                 :          16253 :         perform_spin_delay(&delayStatus);
                               5730                 :          16253 :         buf_state = pg_atomic_read_u32(&buf->state);
                               5731                 :                :     }
                               5732                 :                : 
                               5733                 :           2134 :     finish_spin_delay(&delayStatus);
                               5734                 :                : 
                               5735                 :           2134 :     return buf_state;
                               5736                 :                : }
                               5737                 :                : 
                               5738                 :                : /*
                               5739                 :                :  * BufferTag comparator.
                               5740                 :                :  */
                               5741                 :                : static inline int
 1129 tmunro@postgresql.or     5742                 :         934672 : buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
                               5743                 :                : {
                               5744                 :                :     int         ret;
                               5745                 :                :     RelFileLocator rlocatora;
                               5746                 :                :     RelFileLocator rlocatorb;
                               5747                 :                : 
  599 rhaas@postgresql.org     5748                 :         934672 :     rlocatora = BufTagGetRelFileLocator(ba);
                               5749                 :         934672 :     rlocatorb = BufTagGetRelFileLocator(bb);
                               5750                 :                : 
                               5751                 :         934672 :     ret = rlocator_comparator(&rlocatora, &rlocatorb);
                               5752                 :                : 
 2977 andres@anarazel.de       5753         [ +  + ]:         934672 :     if (ret != 0)
                               5754                 :         405030 :         return ret;
                               5755                 :                : 
  599 rhaas@postgresql.org     5756         [ +  + ]:         529642 :     if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
 2977 andres@anarazel.de       5757                 :          30700 :         return -1;
  599 rhaas@postgresql.org     5758         [ +  + ]:         498942 :     if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
 2977 andres@anarazel.de       5759                 :          19368 :         return 1;
                               5760                 :                : 
                               5761         [ +  + ]:         479574 :     if (ba->blockNum < bb->blockNum)
                               5762                 :         314257 :         return -1;
                               5763         [ +  + ]:         165317 :     if (ba->blockNum > bb->blockNum)
                               5764                 :         163952 :         return 1;
                               5765                 :                : 
                               5766                 :           1365 :     return 0;
                               5767                 :                : }
                               5768                 :                : 
                               5769                 :                : /*
                               5770                 :                :  * Comparator determining the writeout order in a checkpoint.
                               5771                 :                :  *
                               5772                 :                :  * It is important that tablespaces are compared first, the logic balancing
                               5773                 :                :  * writes between tablespaces relies on it.
                               5774                 :                :  */
                               5775                 :                : static inline int
 1129 tmunro@postgresql.or     5776                 :        2549484 : ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
                               5777                 :                : {
                               5778                 :                :     /* compare tablespace */
 2977 andres@anarazel.de       5779         [ +  + ]:        2549484 :     if (a->tsId < b->tsId)
                               5780                 :           4006 :         return -1;
                               5781         [ +  + ]:        2545478 :     else if (a->tsId > b->tsId)
                               5782                 :          15564 :         return 1;
                               5783                 :                :     /* compare relation */
  648 rhaas@postgresql.org     5784         [ +  + ]:        2529914 :     if (a->relNumber < b->relNumber)
 2977 andres@anarazel.de       5785                 :         714848 :         return -1;
  648 rhaas@postgresql.org     5786         [ +  + ]:        1815066 :     else if (a->relNumber > b->relNumber)
 2977 andres@anarazel.de       5787                 :         694620 :         return 1;
                               5788                 :                :     /* compare fork */
                               5789         [ +  + ]:        1120446 :     else if (a->forkNum < b->forkNum)
                               5790                 :          50932 :         return -1;
                               5791         [ +  + ]:        1069514 :     else if (a->forkNum > b->forkNum)
                               5792                 :          52000 :         return 1;
                               5793                 :                :     /* compare block number */
                               5794         [ +  + ]:        1017514 :     else if (a->blockNum < b->blockNum)
                               5795                 :         492826 :         return -1;
 2286 tgl@sss.pgh.pa.us        5796         [ +  + ]:         524688 :     else if (a->blockNum > b->blockNum)
 2977 andres@anarazel.de       5797                 :         479915 :         return 1;
                               5798                 :                :     /* equal page IDs are unlikely, but not impossible */
 2286 tgl@sss.pgh.pa.us        5799                 :          44773 :     return 0;
                               5800                 :                : }
                               5801                 :                : 
                               5802                 :                : /*
                               5803                 :                :  * Comparator for a Min-Heap over the per-tablespace checkpoint completion
                               5804                 :                :  * progress.
                               5805                 :                :  */
                               5806                 :                : static int
 2977 andres@anarazel.de       5807                 :         204417 : ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
                               5808                 :                : {
                               5809                 :         204417 :     CkptTsStatus *sa = (CkptTsStatus *) a;
                               5810                 :         204417 :     CkptTsStatus *sb = (CkptTsStatus *) b;
                               5811                 :                : 
                               5812                 :                :     /* we want a min-heap, so return 1 when a < b */
                               5813         [ +  + ]:         204417 :     if (sa->progress < sb->progress)
                               5814                 :         197434 :         return 1;
                               5815         [ +  + ]:           6983 :     else if (sa->progress == sb->progress)
                               5816                 :            501 :         return 0;
                               5817                 :                :     else
                               5818                 :           6482 :         return -1;
                               5819                 :                : }
                               5820                 :                : 
                               5821                 :                : /*
                               5822                 :                :  * Initialize a writeback context, discarding potential previous state.
                               5823                 :                :  *
                               5824                 :                :  * *max_pending is a pointer instead of an immediate value, so the coalesce
                               5825                 :                :  * limits can easily be changed by the GUC mechanism, and so calling code does
                               5826                 :                :  * not have to check the current configuration. A value of 0 means that no
                               5827                 :                :  * writeback control will be performed.
                               5828                 :                :  */
                               5829                 :                : void
                               5830                 :           2388 : WritebackContextInit(WritebackContext *context, int *max_pending)
                               5831                 :                : {
                               5832         [ -  + ]:           2388 :     Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
                               5833                 :                : 
                               5834                 :           2388 :     context->max_pending = max_pending;
                               5835                 :           2388 :     context->nr_pending = 0;
                               5836                 :           2388 : }
                               5837                 :                : 
                               5838                 :                : /*
                               5839                 :                :  * Add buffer to list of pending writeback requests.
                               5840                 :                :  */
                               5841                 :                : void
  333                          5842                 :         526491 : ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
                               5843                 :                :                               BufferTag *tag)
                               5844                 :                : {
                               5845                 :                :     PendingWriteback *pending;
                               5846                 :                : 
  372 tmunro@postgresql.or     5847         [ +  + ]:         526491 :     if (io_direct_flags & IO_DIRECT_DATA)
                               5848                 :            553 :         return;
                               5849                 :                : 
                               5850                 :                :     /*
                               5851                 :                :      * Add buffer to the pending writeback array, unless writeback control is
                               5852                 :                :      * disabled.
                               5853                 :                :      */
  333 andres@anarazel.de       5854         [ +  + ]:         525938 :     if (*wb_context->max_pending > 0)
                               5855                 :                :     {
                               5856         [ -  + ]:         274213 :         Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
                               5857                 :                : 
                               5858                 :         274213 :         pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
                               5859                 :                : 
 2977                          5860                 :         274213 :         pending->tag = *tag;
                               5861                 :                :     }
                               5862                 :                : 
                               5863                 :                :     /*
                               5864                 :                :      * Perform pending flushes if the writeback limit is exceeded. This
                               5865                 :                :      * includes the case where previously an item has been added, but control
                               5866                 :                :      * is now disabled.
                               5867                 :                :      */
  333                          5868         [ +  + ]:         525938 :     if (wb_context->nr_pending >= *wb_context->max_pending)
                               5869                 :         259451 :         IssuePendingWritebacks(wb_context, io_context);
                               5870                 :                : }
                               5871                 :                : 
                               5872                 :                : #define ST_SORT sort_pending_writebacks
                               5873                 :                : #define ST_ELEMENT_TYPE PendingWriteback
                               5874                 :                : #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
                               5875                 :                : #define ST_SCOPE static
                               5876                 :                : #define ST_DEFINE
                               5877                 :                : #include <lib/sort_template.h>
                               5878                 :                : 
                               5879                 :                : /*
                               5880                 :                :  * Issue all pending writeback requests, previously scheduled with
                               5881                 :                :  * ScheduleBufferTagForWriteback, to the OS.
                               5882                 :                :  *
                               5883                 :                :  * Because this is only used to improve the OS's IO scheduling we try to never
                               5884                 :                :  * error out - it's just a hint.
                               5885                 :                :  */
                               5886                 :                : void
                               5887                 :         260200 : IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
                               5888                 :                : {
                               5889                 :                :     instr_time  io_start;
                               5890                 :                :     int         i;
                               5891                 :                : 
                               5892         [ +  + ]:         260200 :     if (wb_context->nr_pending == 0)
 2977                          5893                 :         251779 :         return;
                               5894                 :                : 
                               5895                 :                :     /*
                               5896                 :                :      * Executing the writes in-order can make them a lot faster, and allows
                               5897                 :                :      * merging writeback requests to consecutive blocks into larger writebacks.
                               5898                 :                :      */
  333                          5899                 :           8421 :     sort_pending_writebacks(wb_context->pending_writebacks,
                               5900                 :           8421 :                             wb_context->nr_pending);
                               5901                 :                : 
  120 michael@paquier.xyz      5902                 :GNC        8421 :     io_start = pgstat_prepare_io_time(track_io_timing);
                               5903                 :                : 
                               5904                 :                :     /*
                               5905                 :                :      * Coalesce neighbouring writes, but nothing else. For that we iterate
                               5906                 :                :      * through the, now sorted, array of pending flushes, and look forward to
                               5907                 :                :      * find all neighbouring (or identical) writes.
                               5908                 :                :      */
  333 andres@anarazel.de       5909         [ +  + ]:CBC       94076 :     for (i = 0; i < wb_context->nr_pending; i++)
                               5910                 :                :     {
                               5911                 :                :         PendingWriteback *cur;
                               5912                 :                :         PendingWriteback *next;
                               5913                 :                :         SMgrRelation reln;
                               5914                 :                :         int         ahead;
                               5915                 :                :         BufferTag   tag;
                               5916                 :                :         RelFileLocator currlocator;
 2977                          5917                 :          85655 :         Size        nblocks = 1;
                               5918                 :                : 
  333                          5919                 :          85655 :         cur = &wb_context->pending_writebacks[i];
 2977                          5920                 :          85655 :         tag = cur->tag;
  599 rhaas@postgresql.org     5921                 :          85655 :         currlocator = BufTagGetRelFileLocator(&tag);
                               5922                 :                : 
                               5923                 :                :         /*
                               5924                 :                :          * Peek ahead, into following writeback requests, to see if they can
                               5925                 :                :          * be combined with the current one.
                               5926                 :                :          */
  333 andres@anarazel.de       5927         [ +  + ]:         270987 :         for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
                               5928                 :                :         {
                               5929                 :                : 
                               5930                 :         262566 :             next = &wb_context->pending_writebacks[i + ahead + 1];
                               5931                 :                : 
                               5932                 :                :             /* different file, stop */
  599 rhaas@postgresql.org     5933   [ +  +  +  +  :         262566 :             if (!RelFileLocatorEquals(currlocator,
                                              +  - ]
                               5934         [ +  + ]:         208416 :                                       BufTagGetRelFileLocator(&next->tag)) ||
                               5935                 :         208416 :                 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
                               5936                 :                :                 break;
                               5937                 :                : 
                               5938                 :                :             /* ok, block queued twice, skip */
 2977 andres@anarazel.de       5939         [ +  + ]:         192351 :             if (cur->tag.blockNum == next->tag.blockNum)
                               5940                 :           1192 :                 continue;
                               5941                 :                : 
                               5942                 :                :             /* only merge consecutive writes */
                               5943         [ +  + ]:         191159 :             if (cur->tag.blockNum + 1 != next->tag.blockNum)
                               5944                 :           7019 :                 break;
                               5945                 :                : 
                               5946                 :         184140 :             nblocks++;
                               5947                 :         184140 :             cur = next;
                               5948                 :                :         }
                               5949                 :                : 
                               5950                 :          85655 :         i += ahead;
                               5951                 :                : 
                               5952                 :                :         /* and finally tell the kernel to write the data to storage */
   42 heikki.linnakangas@i     5953                 :GNC       85655 :         reln = smgropen(currlocator, INVALID_PROC_NUMBER);
  599 rhaas@postgresql.org     5954                 :CBC       85655 :         smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
                               5955                 :                :     }
                               5956                 :                : 
                               5957                 :                :     /*
                               5958                 :                :      * Assume that writeback requests are only issued for buffers containing
                               5959                 :                :      * blocks of permanent relations.
                               5960                 :                :      */
  333 andres@anarazel.de       5961                 :           8421 :     pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
                               5962                 :           8421 :                             IOOP_WRITEBACK, io_start, wb_context->nr_pending);
                               5963                 :                : 
                               5964                 :           8421 :     wb_context->nr_pending = 0;
                               5965                 :                : }
                               5966                 :                : 
                               5967                 :                : /* ResourceOwner callbacks */
                               5968                 :                : 
                               5969                 :                : static void
  158 heikki.linnakangas@i     5970                 :GNC          15 : ResOwnerReleaseBufferIO(Datum res)
                               5971                 :                : {
                               5972                 :             15 :     Buffer      buffer = DatumGetInt32(res);
                               5973                 :                : 
                               5974                 :             15 :     AbortBufferIO(buffer);
                               5975                 :             15 : }
                               5976                 :                : 
                               5977                 :                : static char *
  158 heikki.linnakangas@i     5978                 :UNC           0 : ResOwnerPrintBufferIO(Datum res)
                               5979                 :                : {
                               5980                 :              0 :     Buffer      buffer = DatumGetInt32(res);
                               5981                 :                : 
                               5982                 :              0 :     return psprintf("lost track of buffer IO on buffer %d", buffer);
                               5983                 :                : }
                               5984                 :                : 
                               5985                 :                : static void
  158 heikki.linnakangas@i     5986                 :GNC        4128 : ResOwnerReleaseBufferPin(Datum res)
                               5987                 :                : {
                               5988                 :           4128 :     Buffer      buffer = DatumGetInt32(res);
                               5989                 :                : 
                               5990                 :                :     /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
                               5991         [ -  + ]:           4128 :     if (!BufferIsValid(buffer))
  158 heikki.linnakangas@i     5992         [ #  # ]:UNC           0 :         elog(ERROR, "bad buffer ID: %d", buffer);
                               5993                 :                : 
  158 heikki.linnakangas@i     5994         [ +  + ]:GNC        4128 :     if (BufferIsLocal(buffer))
                               5995                 :            377 :         UnpinLocalBufferNoOwner(buffer);
                               5996                 :                :     else
                               5997                 :           3751 :         UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
                               5998                 :           4128 : }
                               5999                 :                : 
                               6000                 :                : static char *
  158 heikki.linnakangas@i     6001                 :UNC           0 : ResOwnerPrintBufferPin(Datum res)
                               6002                 :                : {
                               6003                 :              0 :     return DebugPrintBufferRefcount(DatumGetInt32(res));
                               6004                 :                : }
                               6005                 :                : 
                               6006                 :                : /*
                               6007                 :                :  * Try to evict the current block in a shared buffer.
                               6008                 :                :  *
                               6009                 :                :  * This function is intended for testing/development use only!
                               6010                 :                :  *
                               6011                 :                :  * To succeed, the buffer must not be pinned on entry, so if the caller had a
                               6012                 :                :  * particular block in mind, it might already have been replaced by some other
                               6013                 :                :  * block by the time this function runs.  It's also unpinned on return, so the
                               6014                 :                :  * buffer might be occupied again by the time control is returned, potentially
                               6015                 :                :  * even by the same block.  This inherent raciness without other interlocking
                               6016                 :                :  * makes the function unsuitable for non-testing usage.
                               6017                 :                :  *
                               6018                 :                :  * Returns true if the buffer was valid and it has now been made invalid.
                               6019                 :                :  * Returns false if it wasn't valid, if it couldn't be evicted due to a pin,
                               6020                 :                :  * or if the buffer becomes dirty again while we're trying to write it out.
                               6021                 :                :  */
                               6022                 :                : bool
    7 tmunro@postgresql.or     6023                 :              0 : EvictUnpinnedBuffer(Buffer buf)
                               6024                 :                : {
                               6025                 :                :     BufferDesc *desc;
                               6026                 :                :     uint32      buf_state;
                               6027                 :                :     bool        result;
                               6028                 :                : 
                               6029                 :                :     /* Make sure we can pin the buffer. */
                               6030                 :              0 :     ResourceOwnerEnlarge(CurrentResourceOwner);
                               6031                 :              0 :     ReservePrivateRefCountEntry();
                               6032                 :                : 
                               6033         [ #  # ]:              0 :     Assert(!BufferIsLocal(buf));
                               6034                 :              0 :     desc = GetBufferDescriptor(buf - 1);
                               6035                 :                : 
                               6036                 :                :     /* Lock the header and check if it's valid. */
                               6037                 :              0 :     buf_state = LockBufHdr(desc);
                               6038         [ #  # ]:              0 :     if ((buf_state & BM_VALID) == 0)
                               6039                 :                :     {
                               6040                 :              0 :         UnlockBufHdr(desc, buf_state);
                               6041                 :              0 :         return false;
                               6042                 :                :     }
                               6043                 :                : 
                               6044                 :                :     /* Check that it's not pinned already. */
                               6045         [ #  # ]:              0 :     if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
                               6046                 :                :     {
                               6047                 :              0 :         UnlockBufHdr(desc, buf_state);
                               6048                 :              0 :         return false;
                               6049                 :                :     }
                               6050                 :                : 
                               6051                 :              0 :     PinBuffer_Locked(desc);     /* releases spinlock */
                               6052                 :                : 
                               6053                 :                :     /* If it was dirty, try to clean it once. */
                               6054         [ #  # ]:              0 :     if (buf_state & BM_DIRTY)
                               6055                 :                :     {
                               6056                 :              0 :         LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
                               6057                 :              0 :         FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
                               6058                 :              0 :         LWLockRelease(BufferDescriptorGetContentLock(desc));
                               6059                 :                :     }
                               6060                 :                : 
                               6061                 :                :     /* This will return false if it becomes dirty or someone else pins it. */
                               6062                 :              0 :     result = InvalidateVictimBuffer(desc);
                               6063                 :                : 
                               6064                 :              0 :     UnpinBuffer(desc);
                               6065                 :                : 
                               6066                 :              0 :     return result;
    7 tmunro@postgresql.or     6067                 :ECB       (648) : }
        

Generated by: LCOV version 2.1-beta2-3-g6141622